Date: (Fri) Nov 20, 2015
Data: Source: Training: https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015/download/NYTimesBlogTrain.csv
New: https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015/download/NYTimesBlogTest.csv
Time period:
Based on analysis utilizing <> techniques,
Summary of key steps & error improvement stats:
Use plot.ly for interactive plots ?
varImp for randomForest crashes in caret version:6.0.41 -> submit bug report
extensions toward multiclass classification are scheduled for the next release
glm_dmy_mdl should use the same method as glm_sel_mdl until a custom dummy classifier is implemented
rm(list = ls())
set.seed(12345)
options(stringsAsFactors = FALSE)
source("~/Dropbox/datascience/R/myscript.R")
source("~/Dropbox/datascience/R/mydsutils.R")
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
source("~/Dropbox/datascience/R/myplot.R")
source("~/Dropbox/datascience/R/mypetrinet.R")
source("~/Dropbox/datascience/R/myplclust.R")
source("~/Dropbox/datascience/R/mytm.R")
# Gather all package requirements here
# Load the parallel backend and caret.
# library() stops immediately with a clear error when a package is missing;
# require() only returns FALSE and the script fails later at first use.
suppressPackageStartupMessages(library(doMC))
# Register at most 6 workers (the original setting), but never more than
# "# of cores on machine - 2" and never fewer than 1. detectCores() may return
# NA on some platforms; in that case the original value of 6 is kept.
registerDoMC(local({
    n_cores <- parallel::detectCores()
    if (is.na(n_cores)) 6 else min(6, max(1, n_cores - 2))
}))
suppressPackageStartupMessages(library(caret))
#source("dbgcaret.R")
#packageVersion("snow")
#require(sos); findFn("cosine", maxPages=2, sortby="MaxScore")
# Analysis control global variables
# Inputs
# Locations of the training file and of the file holding the new (to be predicted) observations
glb_trnng_url <- "https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015/download/NYTimesBlogTrain.csv"
glb_newdt_url <- "https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015/download/NYTimesBlogTest.csv"
glbInpMerge <- NULL #: default
# list(fnames = c("<fname1>", "<fname2>")) # files will be concatenated
# New observations come from their own file (glb_newdt_url) rather than from a split of the training data
glb_is_separate_newobs_dataset <- TRUE # or FALSE
glb_split_entity_newobs_datasets <- FALSE # select from c(FALSE, TRUE)
# The glb_split_* settings below describe how to carve new observations out of the training data;
# NOTE(review): presumably ignored while glb_is_separate_newobs_dataset is TRUE -- confirm in the consuming code
glb_split_newdata_method <- NULL # select from c(NULL, "condition", "sample", "copy")
glb_split_newdata_condition <- NULL # or "is.na(<var>)"; "<var> <condition_operator> <value>"
glb_split_newdata_size_ratio <- 0.3 # > 0 & < 1
glb_split_sample.seed <- 123 # or any integer
glbObsDropCondition <- NULL # : default
# "<condition>" # use | & ; NOT || &&
#parse(text=glbObsDropCondition)
#subset(glbObsAll, .grpid %in% c(31))
glb_obs_repartition_train_condition <- NULL # : default
# "<condition>"
glb_max_fitobs <- NULL # or any integer
# Problem type: glb_is_classification is always derived as the negation of glb_is_regression
glb_is_regression <- FALSE; glb_is_classification <- !glb_is_regression;
glb_is_binomial <- TRUE # or FALSE
# Name of the response column as it appears in the input data
glb_rsp_var_raw <- "Popular"
# for classification, the response variable has to be a factor
glb_rsp_var <- "Popular.fctr" # glb_rsp_var_raw # or "Popular.fctr"
# if the response factor is based on numbers/logicals e.g (0/1 OR TRUE/FALSE vs. "A"/"B"),
# or contains spaces (e.g. "Not in Labor Force")
# caret predict(..., type="prob") crashes
# Maps the raw response to the factor used as the classification response.
#   raw: vector of raw response values (may contain NA)
# Returns a factor of the same length with levels c("N", "Y"), "N" being the
# first (reference) level: raw == 1 becomes "Y", any other non-NA value becomes
# "N", NA stays NA.
# The levels are declared explicitly so the result is well-defined even when
# the input lacks one of the classes, is entirely NA, or is empty;
# relevel(as.factor(...), ref = "N") stops with an error whenever "N" does not
# occur in the data.
glb_map_rsp_raw_to_var <- #NULL
function(raw) {
    # Alternative transformations, kept for regression-type responses:
    #     return(raw ^ 0.5)
    #     return(log(1 + raw))
    #     return(log10(raw))
    #     return(exp(-raw / 2))
    ret_vals <- rep_len(NA_character_, length(raw))
    is_obs <- !is.na(raw)
    ret_vals[is_obs] <- ifelse(raw[is_obs] == 1, "Y", "N")
    return(factor(ret_vals, levels = c("N", "Y")))
    #     #as.factor(paste0("B", raw))
    #     #as.factor(gsub(" ", "\\.", raw))
}
# glb_map_rsp_raw_to_var(tst <- c(NA, 0, 1))
# glb_map_rsp_raw_to_var(tst <- c(NA, 0, 2.99, 280.50, 1000.00))
# Inverse of the response mapping: converts the response factor back to a raw
# numeric coding by taking each value's level code and subtracting one
# (first level -> 0, second level -> 1, ...). NA stays NA.
glb_map_rsp_var_to_raw <- #NULL
function(var) {
    # Alternative inverse transformations, kept for other response types:
    #     return(var ^ 2.0)
    #     return(exp(var))
    #     return(10 ^ var)
    #     return(-log(var) * 2)
    #     gsub("\\.", " ", levels(var)[as.numeric(var)])
    #     c("<=50K", " >50K")[as.numeric(var)]
    #     c(FALSE, TRUE)[as.numeric(var)]
    level_codes <- as.numeric(var)
    return(level_codes - 1)
}
# glb_map_rsp_var_to_raw(glb_map_rsp_raw_to_var(tst))
# A derived response variable (name differs from the raw one) requires a
# raw -> derived mapping function; stop early when it is missing.
if ((glb_rsp_var != glb_rsp_var_raw) && is.null(glb_map_rsp_raw_to_var)) {
    stop("glb_map_rsp_raw_to_var function expected")
}
# List info gathered for various columns
# <col_name>: <description>; <notes>
# NewsDesk = the New York Times desk that produced the story (Business, Culture, Foreign, etc.)
# SectionName = the section the article appeared in (Opinion, Arts, Technology, etc.)
# SubsectionName = the subsection the article appeared in (Education, Small Business, Room for Debate, etc.)
# Headline = the title of the article
# Snippet = a small portion of the article text
# Abstract = a summary of the blog article, written by the New York Times
# WordCount = the number of words in the article
# PubDate = the publication date, in the format "Year-Month-Day Hour:Minute:Second"
# UniqueID = a unique identifier for each article
# If multiple vars are parts of id, consider concatenating them to create one id var
# If glb_id_var == NULL, ".rownames <- row.names()" is the default
# User-specified exclusions
# Features that must not be offered to the models as predictors.
#   - Feats that should be excluded due to known causation by the prediction variable:
#       "<feat1>", "<feat2>"
#   - Feats that are linear combinations (alias in glm)
#   - Feature-engineering phase -> start by excluding all features except
#     id & category & work each one in
glbFeatsExclude <- c(
    "NewsDesk", "SectionName", "SubsectionName",
    "WordCount", "PubDate",
    # Feature Engineering done with prior features
    "Headline", "Snippet", "Abstract"
)
# When the response is a derived variable, the raw response column is excluded too
if (glb_rsp_var_raw != glb_rsp_var) {
    glbFeatsExclude <- union(glbFeatsExclude, glb_rsp_var_raw)
}
# Features to be used only in interaction terms; empty list = none
glbFeatsInteractionOnly <- list()
#glbFeatsInteractionOnly[["carrier.fctr"]] <- "cellular.fctr"
# currently does not handle more than 1 column; consider concatenating multiple columns
# Column that uniquely identifies an observation
glb_id_var <- "UniqueID" # choose from c(NULL : default, "<id_feat>")
# Category feature; NOTE(review): presumably the factor version of the derived "NDSSName.my" feature -- confirm where the ".fctr" suffix is added
glbFeatsCategory <- "NDSSName.my.fctr" # choose from c(NULL : default, "<category>")
# Columns to drop outright; c(NULL) evaluates to NULL, i.e. nothing is dropped
glb_drop_vars <- c(NULL
# , "<feat1>", "<feat2>"
)
glb_map_vars <- NULL # or c("<var1>", "<var2>")
glb_map_urls <- list();
# glb_map_urls[["<var1>"]] <- "<var1.url>"
glb_assign_pairs_lst <- NULL;
# glb_assign_pairs_lst[["<var1>"]] <- list(from=c(NA),
# to=c("NA.my"))
# names(NULL) is NULL, so glb_assign_vars is NULL while no assignment pairs are configured
glb_assign_vars <- names(glb_assign_pairs_lst)
# Derived features; Use this mechanism to cleanse data ??? Cons: Data duplication ???
# Each entry is list(mapfn = <function>, args = <names of the columns passed to mapfn>)
glbFeatsDerive <- list();
# glbFeatsDerive[["<feat.my.sfx>"]] <- list(
# mapfn = function(<arg1>, <arg2>) { return(function(<arg1>, <arg2>)) }
# , args = c("<arg1>", "<arg2>"))
# character
# mapfn = function(Week) { return(substr(Week, 1, 10)) }
# mapfn = function(descriptor) { return(plyr::revalue(descriptor, c(
# "ABANDONED BUILDING" = "OTHER",
# "**" = "**"
# ))) }
# Derived feature "NDSSName.my": one category label per observation, built as
# "<NewsDesk>#<SectionName>#<SubsectionName>" with all spaces removed and then
# recoded through the table below. The table fills in labels whose NewsDesk or
# SectionName part is empty and folds several labels into "myOther"; labels not
# listed in the table are passed through unchanged by plyr::revalue().
glbFeatsDerive[["NDSSName.my"]] <- list(
    mapfn = function(NewsDesk, SectionName, SubsectionName) {
        ndss_key <- paste(NewsDesk, SectionName, SubsectionName, sep = "#")
        ndss_key <- gsub(" ", "", ndss_key)
        # from-label = to-label
        ndss_recode <- c(
            "#BusinessDay#Dealbook" = "Business#BusinessDay#Dealbook",
            "#BusinessDay#SmallBusiness" = "Business#BusinessDay#SmallBusiness",
            "#Crosswords/Games#" = "Business#Crosswords/Games#",
            "#Open#" = "Business#Technology#",
            "#Technology#" = "Business#Technology#",
            "Business##" = "Business#Technology#",
            "#Arts#" = "Culture#Arts#",
            "Foreign##" = "Foreign#World#",
            "#World#AsiaPacific" = "Foreign#World#AsiaPacific",
            "#N.Y./Region#" = "Metro#N.Y./Region#",
            "#Opinion#" = "OpEd#Opinion#",
            "OpEd##" = "OpEd#Opinion#",
            "#Health#" = "Science#Health#",
            "Science##" = "Science#Health#",
            "Styles#Health#" = "Science#Health#",
            "Styles##" = "Styles##Fashion",
            "Styles#Style#Fashion&Style" = "Styles##Fashion",
            "#Travel#" = "Travel#Travel#",
            "Magazine#Magazine#" = "myOther",
            "National##" = "myOther",
            "National#U.S.#Politics" = "myOther",
            "Sports##" = "myOther",
            "Sports#Sports#" = "myOther",
            "#U.S.#" = "myOther"
        )
        return(plyr::revalue(ndss_key, ndss_recode))
    },
    args = c("NewsDesk", "SectionName", "SubsectionName")
)
# mapfn = function(description) { mod_raw <- description;
# This is here because it does not work if it's in txt_map_filename
# mod_raw <- gsub(paste0(c("\n", "\211", "\235", "\317", "\333"), collapse = "|"), " ", mod_raw)
# Don't parse for "." because of ".com"; use customized gsub for that text
# mod_raw <- gsub("(\\w)(!|\\*|,|-|/)(\\w)", "\\1\\2 \\3", mod_raw);
# return(mod_raw) }
#print(mod_raw <- grep(""", glbObsAll[, txt_var], value = TRUE))
#print(mod_raw <- glbObsAll[c(88,187,280,1040,1098), txt_var])
#print(mod_raw <- glbObsAll[sel_obs(list(descr.my.contains="\\bdoes( +)not\\b")), glbFeatsText])
#print(mod_raw <- glbObsAll[sel_obs(list(descr.my.contains="\\bipad [[:digit:]]\\b")), glbFeatsText][01:10])
#print(mod_raw <- glbObsAll[sel_obs(list(descr.my.contains="pad mini")), glbFeatsText][11:20])
#print(mod_raw <- glbObsAll[sel_obs(list(descr.my.contains="pad mini")), glbFeatsText][21:30])
#print(mod_raw <- glbObsAll[sel_obs(list(descr.my.contains="pad mini")), glbFeatsText][31:40])
#glbObsAll[which(glb_post_stop_words_terms_mtrx_lst[[txt_var]][, subset(glb_post_stop_words_terms_df_lst[[txt_var]], term %in% c("conditionminimal"))$pos] > 0), "description"]
# numeric
# Create feature based on record position/id in data
# glbFeatsDerive[["dummy.my"]] <- list(
# mapfn = function(UniqueID) { return(UniqueID) }
# , args = c("UniqueID"))
# Add logs of numerics that are not distributed normally
# Derive & keep multiple transformations of the same feature, if normality is hard to achieve with just one transformation
# Right skew: logp1; sqrt; ^ 1/3; logp1(logp1); log10; exp(-<feat>/constant)
# Three alternative transformations of WordCount, all kept as derived features:
# log(1 + x), square root, and negative exponential.
glbFeatsDerive[["WordCount.log1p"]] <- list(
    mapfn = function(WordCount) log1p(WordCount),
    args = "WordCount"
)
glbFeatsDerive[["WordCount.root2"]] <- list(
    mapfn = function(WordCount) WordCount ^ (1 / 2),
    args = "WordCount"
)
# NOTE(review): exp(-x) is numerically ~0 for all but very small counts; the
# template comment in this file suggests exp(-<feat>/constant) -- confirm that
# no scaling constant is intended here.
glbFeatsDerive[["WordCount.nexp"]] <- list(
    mapfn = function(WordCount) exp(-WordCount),
    args = "WordCount"
)
#print(summary(glbObsAll$WordCount))
#print(summary(mapfn(glbObsAll$WordCount)))
# mapfn = function(Rasmussen) { return(ifelse(sign(Rasmussen) >= 0, 1, 0)) }
# mapfn = function(startprice) { return(startprice ^ (1/2)) }
# mapfn = function(startprice) { return(log(startprice)) }
# mapfn = function(startprice) { return(exp(-startprice / 20)) }
# mapfn = function(startprice) { return(scale(log(startprice))) }
# mapfn = function(startprice) { return(sign(sprice.predict.diff) * (abs(sprice.predict.diff) ^ (1/10))) }
# factor
# mapfn = function(PropR) { return(as.factor(ifelse(PropR >= 0.5, "Y", "N"))) }
# mapfn = function(productline, description) { as.factor(gsub(" ", "", productline)) }
# mapfn = function(purpose) { return(relevel(as.factor(purpose), ref="all_other")) }
# mapfn = function(raw) { tfr_raw <- as.character(cut(raw, 5));
# tfr_raw[is.na(tfr_raw)] <- "NA.my";
# return(as.factor(tfr_raw)) }
# mapfn = function(startprice.log10) { return(cut(startprice.log10, 3)) }
# mapfn = function(startprice.log10) { return(cut(sprice.predict.diff, c(-1000, -100, -10, -1, 0, 1, 10, 100, 1000))) }
# , args = c("<arg1>"))
# multiple args
# mapfn = function(PTS, oppPTS) { return(PTS - oppPTS) }
# mapfn = function(startprice.log10.predict, startprice) {
# return(spdiff <- (10 ^ startprice.log10.predict) - startprice) }
# mapfn = function(productline, description) { as.factor(
# paste(gsub(" ", "", productline), as.numeric(nchar(description) > 0), sep = "*")) }
# # If glbObsAll is not sorted in the desired manner
# mapfn=function(Week) { return(coredata(lag(zoo(orderBy(~Week, glbObsAll)$ILI), -2, na.pad=TRUE))) }
# mapfn=function(ILI) { return(coredata(lag(zoo(ILI), -2, na.pad=TRUE))) }
# mapfn=function(ILI.2.lag) { return(log(ILI.2.lag)) }
# glbFeatsDerive[["<var1>"]] <- glbFeatsDerive[["<var2>"]]
# Names of all derived features configured in glbFeatsDerive
glb_derive_vars <- names(glbFeatsDerive)
# tst <- "descr.my"; args_lst <- NULL; for (arg in glbFeatsDerive[[tst]]$args) args_lst[[arg]] <- glbObsAll[, arg]; print(head(args_lst[[arg]])); print(head(drv_vals <- do.call(glbFeatsDerive[[tst]]$mapfn, args_lst)));
# print(which_ix <- which(args_lst[[arg]] == 0.75)); print(drv_vals[which_ix]);
# Date-time features: one entry per column, giving its parse format, timezone and NA-imputation flag
glbFeatsDateTime <- list()
# NOTE: c() coerces its arguments to a common type, so impute.na is stored as the
# string "FALSE" (not the logical FALSE) in this named character vector
glbFeatsDateTime[["PubDate"]] <-
c(format = "%Y-%m-%d %H:%M:%S", timezone = "America/New_York", impute.na = FALSE)
glbFeatsPrice <- NULL # or c("<price_var>")
# No text features are configured in this run
glbFeatsText <- NULL # or c("<txt_var>")
# Switch every locale category to the "C" locale
Sys.setlocale("LC_ALL", "C") # For english
## [1] "C/C/C/C/C/en_US.UTF-8"
# Text Processing Step: custom modifications not present in txt_munge -> use glbFeatsDerive
# Text Processing Step: universal modifications
glb_txt_munge_filenames_pfx <- "NYTBlogs3_mytxt_"
# Text Processing Step: tolower
# Text Processing Step: myreplacePunctuation
# Text Processing Step: removeWords
# Custom stop words per text variable; only populated when text features are
# configured (glbFeatsText is non-NULL).
glb_txt_stop_words <- list()
# Remember to use unstemmed words
if (!is.null(glbFeatsText)) {
    # library() stops with a clear error when tm is not installed; require()
    # would only warn, return FALSE and let the script fail later.
    library(tm)
    glb_txt_stop_words[["<txt_var>"]] <- sort(c(NULL
        # Remove any words from stopwords
        #         , setdiff(myreplacePunctuation(stopwords("english")), c("<keep_wrd1>", <keep_wrd2>"))
        # cor.y.train == NA
        #         ,unlist(strsplit(paste(c(NULL
        #           ,"<comma-separated-terms>"
        #         ), collapse=",")
        # freq == 1; keep c("<comma-separated-terms-to-keep>")
        #         ,<comma-separated-terms>
        # chisq.pval high (e.g. == 1); keep c("<comma-separated-terms-to-keep>")
        #         ,<comma-separated-terms>
        # nzv.freqRatio high (e.g. >= glb_nzv_freqCut); keep c("<comma-separated-terms-to-keep>")
        #         ,<comma-separated-terms>
    ))
}
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txt_var]][grep("^2", glb_post_stem_words_terms_df_lst[[txt_var]]$term), ])
#glbObsAll[glb_post_stem_words_terms_mtrx_lst[[txt_var]][, 6] > 0, glbFeatsText]
# To identify terms with a specific freq
#paste0(sort(subset(glb_post_stop_words_terms_df_lst[[txt_var]], freq == 1)$term), collapse = ",")
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txt_var]], freq <= 2)$term), collapse = ",")
# To identify terms with a specific freq &
# are not stemmed together later OR is value of color.fctr (e.g. gold)
#paste0(sort(subset(glb_post_stop_words_terms_df_lst[[txt_var]], (freq == 1) & !(term %in% c("blacked","blemish","blocked","blocks","buying","cables","careful","carefully","changed","changing","chargers","cleanly","cleared","connect","connects","connected","contains","cosmetics","default","defaulting","defective","definitely","describe","described","devices","displays","drop","drops","engravement","excellant","excellently","feels","fix","flawlessly","frame","framing","gentle","gold","guarantee","guarantees","handled","handling","having","install","iphone","iphones","keeped","keeps","known","lights","line","lining","liquid","liquidation","looking","lots","manuals","manufacture","minis","most","mostly","network","networks","noted","opening","operated","performance","performs","person","personalized","photograph","physically","placed","places","powering","pre","previously","products","protection","purchasing","returned","rotate","rotation","running","sales","second","seconds","shipped","shuts","sides","skin","skinned","sticker","storing","thats","theres","touching","unusable","update","updates","upgrade","weeks","wrapped","verified","verify") ))$term), collapse = ",")
#print(subset(glb_post_stem_words_terms_df_lst[[txt_var]], (freq <= 2)))
#glbObsAll[which(terms_mtrx[, 229] > 0), glbFeatsText]
# To identify terms with cor.y == NA
#orderBy(~-freq+term, subset(glb_post_stop_words_terms_df_lst[[txt_var]], is.na(cor.y)))
#paste(sort(subset(glb_post_stop_words_terms_df_lst[[txt_var]], is.na(cor.y))[, "term"]), collapse=",")
#orderBy(~-freq+term, subset(glb_post_stem_words_terms_df_lst[[txt_var]], is.na(cor.y)))
# To identify terms with low cor.y.abs
#head(orderBy(~cor.y.abs+freq+term, subset(glb_post_stem_words_terms_df_lst[[txt_var]], !is.na(cor.y))), 5)
# To identify terms with high chisq.pval
#subset(glb_post_stem_words_terms_df_lst[[txt_var]], chisq.pval > 0.99)
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txt_var]], (chisq.pval > 0.99) & (freq <= 10))$term), collapse=",")
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txt_var]], (chisq.pval > 0.9))$term), collapse=",")
#head(orderBy(~-chisq.pval+freq+term, glb_post_stem_words_terms_df_lst[[txt_var]]), 5)
#glbObsAll[glb_post_stem_words_terms_mtrx_lst[[txt_var]][, 68] > 0, glbFeatsText]
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txt_var]][grep("^m", glb_post_stem_words_terms_df_lst[[txt_var]]$term), ])
# To identify terms with high nzv.freqRatio
#summary(glb_post_stem_words_terms_df_lst[[txt_var]]$nzv.freqRatio)
#paste0(sort(setdiff(subset(glb_post_stem_words_terms_df_lst[[txt_var]], (nzv.freqRatio >= glb_nzv_freqCut) & (freq < 10) & (chisq.pval >= 0.05))$term, c( "128gb","3g","4g","gold","ipad1","ipad3","ipad4","ipadair2","ipadmini2","manufactur","spacegray","sprint","tmobil","verizon","wifion"))), collapse=",")
# To identify obs with a txt term
#tail(orderBy(~-freq+term, glb_post_stop_words_terms_df_lst[[txt_var]]), 20)
#mydspObs(list(descr.my.contains="non"), cols=c("color", "carrier", "cellular", "storage"))
#grep("ever", dimnames(terms_stop_mtrx)$Terms)
#which(terms_stop_mtrx[, grep("ipad", dimnames(terms_stop_mtrx)$Terms)] > 0)
#glbObsAll[which(terms_stop_mtrx[, grep("16", dimnames(terms_stop_mtrx)$Terms)[1]] > 0), c(glbFeatsCategory, "storage", txt_var)]
# To identify whether terms shd be synonyms
#orderBy(~term, glb_post_stop_words_terms_df_lst[[txt_var]][grep("^moder", glb_post_stop_words_terms_df_lst[[txt_var]]$term), ])
# term_row_df <- glb_post_stop_words_terms_df_lst[[txt_var]][grep("^came$", glb_post_stop_words_terms_df_lst[[txt_var]]$term), ]
#
# cor(glb_post_stop_words_terms_mtrx_lst[[txt_var]][glbObsAll$.lcn == "Fit", term_row_df$pos], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
# To identify which stopped words are "close" to a txt term
#sort(cluster_vars)
# Text Processing Step: stemDocument
# To identify stemmed txt terms
#glb_post_stop_words_terms_df_lst[[txt_var]][grep("condit", glb_post_stop_words_terms_df_lst[[txt_var]]$term), ]
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txt_var]][grep("^con", glb_post_stem_words_terms_df_lst[[txt_var]]$term), ])
#glbObsAll[which(terms_stem_mtrx[, grep("use", dimnames(terms_stem_mtrx)$Terms)[[1]]] > 0), c(glb_id_var, "productline", txt_var)]
#glbObsAll[which(TfIdf_stem_mtrx[, 191] > 0), c(glb_id_var, glbFeatsCategory, txt_var)]
#which(glbObsAll$UniqueID %in% c(11915, 11926, 12198))
# Text Processing Step: mycombineSynonyms
# To identify which terms are associated with not -> combine "could not" & "couldn't"
#findAssocs(glb_full_DTM_lst[[txt_var]], "not", 0.05)
# To identify which synonyms should be combined
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txt_var]][grep("^c", glb_post_stem_words_terms_df_lst[[txt_var]]$term), ])
# Prints term statistics before and after combining a set of synonyms.
#   syn_lst: list(word = <term the synonyms are combined into>,
#                 syns = <character vector of synonym terms>)
# First prints the rows of the post-stemming terms table whose term is one of
# syn_lst$syns, then the row for syn_lst$word after mycombineSynonyms has been
# applied to the corpus.
# NOTE(review): relies on txt_var, glb_post_stem_words_terms_df_lst and
# glb_txt_corpus_lst being available in the calling environment (they are not
# parameters) -- confirm they are set before calling.
chk_comb_cor <- function(syn_lst) {
    #     cor(terms_stem_mtrx[glbObsAll$.src == "Train", grep("^(damag|dent|ding)$", dimnames(terms_stem_mtrx)[[2]])], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
    stem_terms_df <- glb_post_stem_words_terms_df_lst[[txt_var]]
    print(subset(stem_terms_df, term %in% syn_lst$syns))

    combined_corpus <- tm_map(glb_txt_corpus_lst[[txt_var]], mycombineSynonyms,
                              list(syn_lst), lazy=FALSE)
    combined_terms_df <- get_corpus_terms(combined_corpus)
    print(subset(combined_terms_df, term == syn_lst$word))
    #     cor(terms_stop_mtrx[glbObsAll$.src == "Train", grep("^(damage|dent|ding)$", dimnames(terms_stop_mtrx)[[2]])], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
    #     cor(rowSums(terms_stop_mtrx[glbObsAll$.src == "Train", grep("^(damage|dent|ding)$", dimnames(terms_stop_mtrx)[[2]])]), glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
}
#chk_comb_cor(syn_lst=list(word="cabl", syns=c("cabl", "cord")))
#chk_comb_cor(syn_lst=list(word="damag", syns=c("damag", "dent", "ding")))
#chk_comb_cor(syn_lst=list(word="dent", syns=c("dent", "ding")))
#chk_comb_cor(syn_lst=list(word="use", syns=c("use", "usag")))
# Synonym groups per text variable; empty list = none configured
glb_txt_synonyms <- list()
#glb_txt_synonyms[["<txt_var>"]] <- list(NULL
# , list(word="<stem1>", syns=c("<stem1>", "<stem1_2>"))
# )
# Term-matrix control settings
# options include: "weightTf", "myweightTflog1p", "myweightTfsqrt", "weightTfIdf", "weightBM25"
glb_txt_terms_control <- list(weighting = "weightTfIdf" # : default
# termFreq selection criteria across obs: tm default: list(global=c(1, Inf))
, bounds = list(global = c(1, Inf))
# wordLengths selection criteria: tm default: c(3, Inf)
, wordLengths = c(1, Inf)
)
glb_txt_cor_var <- glb_rsp_var # : default # or c(<feat>)
# select one from c("union.top.val.cor", "top.cor", "top.val", default: "top.chisq", "sparse")
glbFeatsTextFilter <- "top.chisq"
# One value per text variable, named by the variable; zero-length while glbFeatsText is NULL
glbFeatsTextTermsMax <- rep(10, length(glbFeatsText)) # :default
names(glbFeatsTextTermsMax) <- glbFeatsText
# Text Processing Step: extractAssoc
# One value per text variable, named by the variable; zero-length while glbFeatsText is NULL
glbFeatsTextAssocCor <- rep(1, length(glbFeatsText)) # :default
names(glbFeatsTextAssocCor) <- glbFeatsText
# Remember to use stemmed terms
glb_important_terms <- list()
# Text Processing Step: extractPatterns (ngrams)
glbFeatsTextPatterns <- list()
#glbFeatsTextPatterns[["<txt_var>"]] <- list()
#glbFeatsTextPatterns[["<txt_var>"]] <- c(metropolitan.diary.colon = "Metropolitan Diary:")
# Have to set it even if it is not used
# Properties:
# numrows(glb_feats_df) << numrows(glbObsFit
# Select terms that appear in at least 0.2 * O(FP/FN(glbObsOOB)) ???
# numrows(glbObsOOB) = 1.1 * numrows(glbObsNew) ???
# Sparsity thresholds per text variable; NULL = none set
glb_sprs_thresholds <- NULL # or c(<txt_var1> = 0.988, <txt_var2> = 0.970, <txt_var3> = 0.970)
glbFctrMaxUniqVals <- 21 # default: 20
# Missing-value imputation switch and its random seed
glb_impute_na_data <- TRUE # or FALSE
glb_mice_complete.seed <- 144 # or any integer
# Clustering switch, its random seed and the variable used for cluster entropy
glb_cluster <- FALSE # : default or TRUE
glb_cluster.seed <- 189 # or any integer
glb_cluster_entropy_var <- glb_rsp_var # c(glb_rsp_var, as.factor(cut(glb_rsp_var, 3)), default: NULL)
glbFeatsTextClusterVarsExclude <- FALSE # default FALSE
glb_interaction_only_feats <- NULL # : default or c(<parent_feat> = "<child_feat>")
# Near-zero-variance cut-offs
glb_nzv_freqCut <- 19 # 19 : caret default
glb_nzv_uniqueCut <- 10 # 10 : caret default
# Subset sizes for recursive feature elimination, keyed by model family; empty list = none specified
glbRFESizes <- list()
#glbRFESizes[["mdlFamily"]] <- c(4, 8, 16, 32, 64, 67, 68, 69) # Accuracy@69/70 = 0.8258
# Ids of fit observations to treat as outliers, keyed by model family; empty list = none
glbObsFitOutliers <- list()
# If outliers.n >= 10; consider concatenation of interaction vars
# glbObsFitOutliers[["<mdlFamily>"]] <- c(NULL
# is.na(.rstudent)
# is.na(.dffits)
# .hatvalues >= 0.99
# -38,167,642 < minmax(.rstudent) < 49,649,823
# , <comma-separated-<glb_id_var>>
# )
glbObsTrnOutliers <- list()
# influence.measures: car::outlier; rstudent; dffits; hatvalues; dfbeta; dfbetas
#mdlId <- "RFE.X.glm"; obs_df <- fitobs_df
#mdlId <- "Final.glm"; obs_df <- trnobs_df
#mdlId <- "CSM2.X.glm"; obs_df <- fitobs_df
#print(outliers <- car::outlierTest(glb_models_lst[[mdlId]]$finalModel))
#mdlIdFamily <- paste0(head(unlist(str_split(mdlId, "\\.")), -1), collapse="."); obs_df <- dplyr::filter_(obs_df, interp(~(!(var %in% glbObsFitOutliers[[mdlIdFamily]])), var = as.name(glb_id_var))); model_diags_df <- cbind(obs_df, data.frame(.rstudent=stats::rstudent(glb_models_lst[[mdlId]]$finalModel)), data.frame(.dffits=stats::dffits(glb_models_lst[[mdlId]]$finalModel)), data.frame(.hatvalues=stats::hatvalues(glb_models_lst[[mdlId]]$finalModel)));print(summary(model_diags_df[, c(".rstudent",".dffits",".hatvalues")])); table(cut(model_diags_df$.hatvalues, breaks=c(0.00, 0.98, 0.99, 1.00)))
#print(subset(model_diags_df, is.na(.rstudent))[, glb_id_var])
#print(subset(model_diags_df, is.na(.dffits))[, glb_id_var])
#print(model_diags_df[which.min(model_diags_df$.dffits), ])
#print(subset(model_diags_df, .hatvalues > 0.99)[, glb_id_var])
#dffits_df <- merge(dffits_df, outliers_df, by="row.names", all.x=TRUE); row.names(dffits_df) <- dffits_df$Row.names; dffits_df <- subset(dffits_df, select=-Row.names)
#dffits_df <- merge(dffits_df, glbObsFit, by="row.names", all.x=TRUE); row.names(dffits_df) <- dffits_df$Row.names; dffits_df <- subset(dffits_df, select=-Row.names)
#subset(dffits_df, !is.na(.Bonf.p))
#mdlId <- "CSM.X.glm"; vars <- myextract_actual_feats(row.names(orderBy(reformulate(c("-", paste0(mdlId, ".imp"))), myget_feats_imp(glb_models_lst[[mdlId]]))));
#model_diags_df <- glb_get_predictions(model_diags_df, mdlId, glb_rsp_var)
#obs_ix <- row.names(model_diags_df) %in% names(outliers$rstudent)[1]
#obs_ix <- which(is.na(model_diags_df$.rstudent))
#obs_ix <- which(is.na(model_diags_df$.dffits))
#myplot_parcoord(obs_df=model_diags_df[, c(glb_id_var, glbFeatsCategory, ".rstudent", ".dffits", ".hatvalues", glb_rsp_var, paste0(glb_rsp_var, mdlId), vars[1:min(20, length(vars))])], obs_ix=obs_ix, id_var=glb_id_var, category_var=glbFeatsCategory)
#model_diags_df[row.names(model_diags_df) %in% names(outliers$rstudent)[c(1:2)], ]
#ctgry_diags_df <- model_diags_df[model_diags_df[, glbFeatsCategory] %in% c("Unknown#0"), ]
#myplot_parcoord(obs_df=ctgry_diags_df[, c(glb_id_var, glbFeatsCategory, ".rstudent", ".dffits", ".hatvalues", glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet", indep_vars[1:20])], obs_ix=row.names(ctgry_diags_df) %in% names(outliers$rstudent)[1], id_var=glb_id_var, category_var=glbFeatsCategory)
#table(glbObsFit[model_diags_df[, glbFeatsCategory] %in% c("iPad1#1"), "startprice.log10.cut.fctr"])
#glbObsFit[model_diags_df[, glbFeatsCategory] %in% c("iPad1#1"), c(glb_id_var, "startprice")]
# No outliers & .dffits == NaN
#myplot_parcoord(obs_df=model_diags_df[, c(glb_id_var, glbFeatsCategory, glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet", indep_vars[1:10])], obs_ix=seq(1:nrow(model_diags_df))[is.na(model_diags_df$.dffits)], id_var=glb_id_var, category_var=glbFeatsCategory)
# Modify mdlId to (build & extract) "<FamilyId>#<Fit|Trn>#<caretMethod>#<preProc1.preProc2>#<samplingMethod>"
# Containers for the fitted models and their summary table
glb_models_lst <- list()
glb_models_df <- data.frame()

# Candidate caret methods, chosen by problem type:
# regression, binomial classification, or multinomial classification.
glbMdlMethods <- if (glb_is_regression) {
    # Regression
    c(
        # deterministic
        #, "lm", # same as glm
        "glm", "bayesglm", "glmnet",
        "rpart",
        # non-deterministic
        "gbm", "rf",
        # Unknown
        "nnet", "avNNet", # runs 25 models per cv sample for tunelength=5
        "svmLinear", "svmLinear2",
        "svmPoly", # runs 75 models per cv sample for tunelength=5
        "svmRadial",
        "earth",
        "bagEarth" # Takes a long time
    )
} else if (glb_is_binomial) {
    # Classification - Add ada (auto feature selection)
    c(
        # deterministic
        "bagEarth", # Takes a long time
        "glm", "bayesglm", "glmnet",
        "nnet",
        "rpart",
        # non-deterministic
        "gbm",
        "avNNet", # runs 25 models per cv sample for tunelength=5
        "rf",
        # Unknown
        "lda", "lda2",
        # svm models crash when predict is called -> internal to kernlab it should call predict without .outcome
        "svmLinear", "svmLinear2",
        "svmPoly", # runs 75 models per cv sample for tunelength=5
        "svmRadial",
        "earth"
    )
} else {
    c(
        # non-deterministic
        "rf",
        # Unknown
        "gbm", "rpart"
    )
}
# Model families to fit (family id -> caret method(s)) and optional per-family feature lists
glb_mdl_family_lst <- list(); glb_mdl_feats_lst <- list()
# family: Choose from c("RFE.X", "CSM.X", "All.X", "Best.Interact")
# methods: Choose from c(NULL, <method>, glbMdlMethods)
#glb_mdl_family_lst[["RFE.X"]] <- c("glmnet", "glm") # non-NULL list is mandatory
# Only the "All.X" family is enabled, fitted with glmnet
glb_mdl_family_lst[["All.X"]] <- "glmnet" # non-NULL list is mandatory
#glb_mdl_family_lst[["Best.Interact"]] <- "glmnet" # non-NULL list is mandatory
# Check if interaction features make RFE better
# glb_mdl_family_lst[["CSM.X"]] <- setdiff(glbMdlMethods, c("lda", "lda2")) # crashing due to category:.clusterid ??? #c("glmnet", "glm") # non-NULL list is mandatory
# glb_mdl_feats_lst[["CSM.X"]] <- c(NULL
# , <comma-separated-features-vector>
# )
# dAFeats.CSM.X %<d-% c(NULL
# # Interaction feats up to varImp(RFE.X.glmnet) >= 50
# , <comma-separated-features-vector>
# , setdiff(myextract_actual_feats(predictors(rfe_fit_results)), c(NULL
# , <comma-separated-features-vector>
# ))
# )
# glb_mdl_feats_lst[["CSM.X"]] <- "%<d-% dAFeats.CSM.X"
# Check if tuning parameters make fit better; make it mdlFamily customizable ?
# Tuning-parameter grid shared by all models; starts out empty
glb_tune_models_df <- data.frame()
# Experiment specific code to avoid caret crash
# glmnet grid: one row per tuning parameter, candidate values given as a
# space-separated string (five alpha values, a single lambda value).
# NOTE(review): within the visible part of this file glmnet_tune_models_df is never
# appended to glb_tune_models_df -- confirm it is consumed elsewhere
glmnet_tune_models_df <- rbind(data.frame()
,data.frame(method = "glmnet", parameter = "alpha",
vals = "0.100 0.325 0.550 0.775 1.000")
,data.frame(method = "glmnet", parameter = "lambda",
vals = "9.342e-02")
)
#avNNet
# size=[1] 3 5 7 9; decay=[0] 1e-04 0.001 0.01 0.1; bag=[FALSE]; RMSE=1.3300906
#bagEarth
# degree=1 [2] 3; nprune=64 128 256 512 [1024]; RMSE=0.6486663 (up)
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "bagEarth", parameter = "nprune", vals = "256")
# ,data.frame(method = "bagEarth", parameter = "degree", vals = "2")
# ))
#earth
# degree=[1]; nprune=2 [9] 17 25 33; RMSE=0.1334478
#gbm
# shrinkage=0.05 [0.10] 0.15 0.20 0.25; n.trees=100 150 200 [250] 300; interaction.depth=[1] 2 3 4 5; n.minobsinnode=[10]; RMSE=0.2008313
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "gbm", parameter = "shrinkage", min = 0.05, max = 0.25, by = 0.05)
# ,data.frame(method = "gbm", parameter = "n.trees", min = 100, max = 300, by = 50)
# ,data.frame(method = "gbm", parameter = "interaction.depth", min = 1, max = 5, by = 1)
# ,data.frame(method = "gbm", parameter = "n.minobsinnode", min = 10, max = 10, by = 10)
# #seq(from=0.05, to=0.25, by=0.05)
# ))
#glmnet
# alpha=0.100 [0.325] 0.550 0.775 1.000; lambda=0.0005232693 0.0024288010 0.0112734954 [0.0523269304] 0.2428800957; RMSE=0.6164891
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "glmnet", parameter = "alpha", vals = "0.550 0.775 0.8875 0.94375 1.000")
# ,data.frame(method = "glmnet", parameter = "lambda", vals = "9.858855e-05 0.0001971771 0.0009152152 0.0042480525 0.0197177130")
# ))
#nnet
# size=3 5 [7] 9 11; decay=0.0001 0.001 0.01 [0.1] 0.2; RMSE=0.9287422
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "nnet", parameter = "size", vals = "3 5 7 9 11")
# ,data.frame(method = "nnet", parameter = "decay", vals = "0.0001 0.0010 0.0100 0.1000 0.2000")
# ))
#rf # Don't bother; results are not deterministic
# mtry=2 35 68 [101] 134; RMSE=0.1339974
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "rf", parameter = "mtry", vals = "2 5 9 13 17")
# ))
#rpart
# cp=0.020 [0.025] 0.030 0.035 0.040; RMSE=0.1770237
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "rpart", parameter = "cp", vals = "0.004347826 0.008695652 0.017391304 0.021739130 0.034782609")
# ))
#svmLinear
# C=0.01 0.05 [0.10] 0.50 1.00 2.00 3.00 4.00; RMSE=0.1271318; 0.1296718
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "svmLinear", parameter = "C", vals = "0.01 0.05 0.1 0.5 1")
# ))
#svmLinear2
# cost=0.0625 0.1250 [0.25] 0.50 1.00; RMSE=0.1276354
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "svmLinear2", parameter = "cost", vals = "0.0625 0.125 0.25 0.5 1")
# ))
#svmPoly
# degree=[1] 2 3 4 5; scale=0.01 0.05 [0.1] 0.5 1; C=0.50 1.00 [2.00] 3.00 4.00; RMSE=0.1276130
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method="svmPoly", parameter="degree", min=1, max=5, by=1) #seq(1, 5, 1)
# ,data.frame(method="svmPoly", parameter="scale", vals="0.01, 0.05, 0.1, 0.5, 1")
# ,data.frame(method="svmPoly", parameter="C", vals="0.50, 1.00, 2.00, 3.00, 4.00")
# ))
#svmRadial
# sigma=[0.08674323]; C=0.25 0.50 1.00 [2.00] 4.00; RMSE=0.1614957
#glb2Sav(); all.equal(sav_models_df, glb_models_df)
# Pre-processing methods to try; NULL = none
glb_preproc_methods <- NULL
# c("YeoJohnson", "center.scale", "range", "pca", "ica", "spatialSign")
# Baseline prediction model feature(s)
glb_Baseline_mdl_var <- NULL # or c("<feat>")
# Optional custom model metric: a weight matrix that the example summary function
# below multiplies element-wise with the confusion matrix; NULL = not used
glbMdlMetric_terms <- NULL # or matrix(c(
# 0,1,2,3,4,
# 2,0,1,2,3,
# 4,2,0,1,2,
# 6,4,2,0,1,
# 8,6,4,2,0
# ), byrow=TRUE, nrow=5)
glbMdlMetricSummary <- NULL # or "<metric_name>"
glbMdlMetricMaximize <- NULL # or FALSE (TRUE is not the default for both classification & regression)
glbMdlMetricSummaryFn <- NULL # or function(data, lev=NULL, model=NULL) {
# confusion_mtrx <- t(as.matrix(confusionMatrix(data$pred, data$obs)))
# #print(confusion_mtrx)
# #print(confusion_mtrx * glbMdlMetric_terms)
# metric <- sum(confusion_mtrx * glbMdlMetric_terms) / nrow(data)
# names(metric) <- glbMdlMetricSummary
# return(metric)
# }
# Repeated cross-validation: number of folds and number of repeats
glb_rcv_n_folds <- 3 # or NULL
glb_rcv_n_repeats <- 3 # or NULL
# Classification probability threshold; NULL = not fixed here
glb_clf_proba_threshold <- NULL # 0.5
# Model selection criteria
# Metrics used to rank models, listed in order; the min./max. prefix of each
# name states the direction in which the metric is better.
if (glb_is_regression) {
    glbMdlMetricsEval <- c("min.RMSE.OOB", "max.R.sq.OOB", "max.Adj.R.sq.fit", "min.RMSE.fit")
    #glbMdlMetricsEval <- c("min.RMSE.fit", "max.R.sq.fit", "max.Adj.R.sq.fit")
}
if (glb_is_classification) {
    glbMdlMetricsEval <- if (glb_is_binomial) {
        c("max.Accuracy.OOB", "max.AUCROCR.OOB", "max.AUCpROC.OOB", "min.aic.fit", "max.Accuracy.fit")
    } else {
        c("max.Accuracy.OOB", "max.Kappa.OOB")
    }
}
# Ensemble membership: select from NULL [no ensemble models], "auto" [all models better than MFO or Baseline], c(mdl_ids in glb_models_lst) [Typically top-rated models in auto]
glb_mdl_ensemble <- NULL
# "%<d-% setdiff(mygetEnsembleAutoMdlIds(), 'CSM.X.rf')"
# c(<comma-separated-mdlIds>
# )
# Only for classifications; for regressions remove "(.*)\\.prob" from the regex
# tmp_fitobs_df <- glbObsFit[, grep(paste0("^", gsub(".", "\\.", mygetPredictIds$value, fixed = TRUE), "CSM\\.X\\.(.*)\\.prob"), names(glbObsFit), value = TRUE)]; cor_mtrx <- cor(tmp_fitobs_df); cor_vctr <- sort(cor_mtrx[row.names(orderBy(~-Overall, varImp(glb_models_lst[["Ensemble.repeatedcv.glmnet"]])$imp))[1], ]); summary(cor_vctr); cor_vctr
#ntv.glm <- glm(reformulate(indep_vars, glb_rsp_var), family = "binomial", data = glbObsFit)
#step.glm <- step(ntv.glm)
# Id of the selected model and of the final model (final left NULL here)
glb_sel_mdl_id <- "All.X##rcv#glmnet" #select from c(NULL, "All.X##rcv#glmnet", "RFE.X##rcv#glmnet", <mdlId>)
glb_fin_mdl_id <- NULL #select from c(NULL, glb_sel_mdl_id)
# Columns to display: id, category feature(s) and response, plus any listed below
glb_dsp_cols <- c(glb_id_var, glbFeatsCategory, glb_rsp_var
# List critical cols excl. glb_id_var, glbFeatsCategory & glb_rsp_var
)
# Output specs
glbOutDataVizFname <- "NYTBlogs3_obsall.csv" # choose from c(NULL, "NYTBlogs3_obsall.csv")
glb_out_obs <- NULL # select from NULL (defaults to "new"), "all", "new" or "trn"
# Output columns, keyed by output column name
glb_out_vars_lst <- list()
# glb_id_var will be the first output column, by default
# The value is stored as a string and is not evaluated here.
# NOTE(review): the "%<d-%" prefix presumably marks an expression to be resolved later (delayed assignment) -- confirm
glb_out_vars_lst[["Probability1"]] <-
"%<d-% mygetPredictIds(glb_rsp_var, glb_fin_mdl_id)$prob"
# glb_out_vars_lst[[glb_rsp_var_raw]] <- glb_rsp_var_raw
# glb_out_vars_lst[[paste0(head(unlist(strsplit(mygetPredictIds$value, "")), -1), collapse = "")]] <-
glbOutStackFnames <- NULL #: default
# c("ebayipads_txt_assoc1_out_bid1_stack.csv") # manual stack
# c("ebayipads_finmdl_bid1_out_nnet_1.csv") # universal stack
# Prefix prepended to output file names (e.g. paste0(glb_out_pfx, "FeatsTxtClusters.png") in the clustering step)
glb_out_pfx <- "NYTBlogs3_feat_PubDate_"
glb_save_envir <- FALSE # or TRUE
# Depict process: Petri net of the analytics workflow
# (6 transitions, 4 places, 12 arcs; begin[i] -> end[i] defines arc i).
# NOTE(review): M0 is presumably the initial marking (3 tokens in "bgn") -- confirm in mypetrinet.R
glb_analytics_pn <- petrinet(
    name = "glb_analytics_pn",
    trans_df = data.frame(
        id = 1:6,
        name = c("data.training.all", "data.new",
                 "model.selected", "model.final",
                 "data.training.all.prediction", "data.new.prediction"),
        x = c(-5, -5, -15, -25, -25, -35),
        y = c(-5, 5, 0, 0, -5, 5)
    ),
    places_df = data.frame(
        id = 1:4,
        name = c("bgn", "fit.data.training.all", "predict.data.new", "end"),
        x = c(-0, -20, -30, -40),
        y = c(0, 0, 0, 0),
        M0 = c(3, 0, 0, 0)
    ),
    arcs_df = data.frame(
        begin = c("bgn",
                  "bgn",
                  "bgn",
                  "data.training.all",
                  "model.selected",
                  "fit.data.training.all",
                  "fit.data.training.all",
                  "model.final",
                  "data.new",
                  "predict.data.new",
                  "data.training.all.prediction",
                  "data.new.prediction"),
        end = c("data.training.all",
                "data.new",
                "model.selected",
                "fit.data.training.all",
                "fit.data.training.all",
                "model.final",
                "data.training.all.prediction",
                "predict.data.new",
                "predict.data.new",
                "data.new.prediction",
                "end",
                "end")
    )
)
#print(ggplot.petrinet(glb_analytics_pn))
# Render the net with the axes flipped
print(ggplot.petrinet(glb_analytics_pn) + coord_flip())
## Loading required package: grid
glb_analytics_avl_objs <- NULL
# Start the chunk log with the first step, "import.data"
glb_chunks_df <- myadd_chunk(NULL, "import.data")
## label step_major step_minor label_minor bgn end elapsed
## 1 import.data 1 0 0 9.601 NA NA
1.0: import data
## [1] "Reading file ./data/NYTimesBlogTrain.csv..."
## [1] "dimensions of data in ./data/NYTimesBlogTrain.csv: 6,532 rows x 10 cols"
## NewsDesk SectionName SubsectionName
## 1 Business Crosswords/Games
## 2 Culture Arts
## 3 Business Business Day Dealbook
## 4 Business Business Day Dealbook
## 5 Science Health
## 6 Science Health
## Headline
## 1 More School Daze
## 2 New 96-Page Murakami Work Coming in December
## 3 Public Pension Funds Stay Mum on Corporate Expats
## 4 Boot Camp for Bankers
## 5 Of Little Help to Older Knees
## 6 A Benefit of Legal Marijuana
## Snippet
## 1 A puzzle from Ethan Cooper that reminds me that a bill is due.
## 2 The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His Years of Pilgrimage.
## 3 Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little about the strategy, which could hurt the nations tax base.
## 4 As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service members ideal customers.
## 5 Middle-aged and older patients are unlikely to benefit in the long term from surgery to repair tears in the meniscus, pads of cartilage in the knee, a new review of studies has found.
## 6 A new study has found evidence that legal access to marijuana is associated with fewer opioid overdose deaths, but researchers said their findings should not be used as the basis for the wide adoption of legalized cannabis.
## Abstract
## 1 A puzzle from Ethan Cooper that reminds me that a bill is due.
## 2 The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His Years of Pilgrimage.
## 3 Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little about the strategy, which could hurt the nations tax base.
## 4 As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service members ideal customers.
## 5 Middle-aged and older patients are unlikely to benefit in the long term from surgery to repair tears in the meniscus, pads of cartilage in the knee, a new review of studies has found.
## 6 A new study has found evidence that legal access to marijuana is associated with fewer opioid overdose deaths, but researchers said their findings should not be used as the basis for the wide adoption of legalized cannabis.
## WordCount PubDate Popular UniqueID
## 1 508 2014-09-01 22:00:09 1 1
## 2 285 2014-09-01 21:14:07 0 2
## 3 1211 2014-09-01 21:05:36 0 3
## 4 1405 2014-09-01 20:43:34 1 4
## 5 181 2014-09-01 18:58:51 1 5
## 6 245 2014-09-01 18:52:22 1 6
## NewsDesk SectionName SubsectionName
## 226 Styles
## 995
## 3327
## 4753 Multimedia
## 4802 Business Crosswords/Games
## 6463 TStyle
## Headline
## 226 For Tavi Gevinson, Fashion Takes a Back Seat, for Now
## 995 Reconsidering What to Call an Extremist Group
## 3327 Clinton's Diagnosis of What's Wrong With Politics
## 4753 'Off Color' and on Target About Race in America
## 4802 Daniel Finkel's Circle-Toss Game
## 6463 Entering the Void
## Snippet
## 226 Tavi Gevinson, the teenage fashion star turned Broadway actress, wont be much of a player at New York Fashion Week this season.
## 995 Editors have decided to adjust how The Times refer to an Islamic extremist group that controls territory in Syria and Iraq.
## 3327 Hillary Rodham Clinton continued to laugh off questions about her presidential aspirations on Tuesday, but she did shed some light on what she thinks is wrong in Washington.
## 4753 Off Color, a New York Times video series, looks at how artists of color are making sharp social commentary about race in America through comedy and performance.
## 4802 By math educator Daniel Finkel, a puzzle thats childs play. Can you figure it out?
## 6463 The Spanish artist Miquel Barcel closely examines the basic materials of life in response to Edward Hirsch questioning his own belief in a higher power.
## Abstract
## 226 Tavi Gevinson, the teenage fashion star turned Broadway actress, wont be much of a player at New York Fashion Week this season.
## 995 Editors have decided to adjust how The Times refer to an Islamic extremist group that controls territory in Syria and Iraq.
## 3327 Hillary Rodham Clinton continued to laugh off questions about her presidential aspirations on Tuesday, but she did shed some light on what she thinks is wrong in Washington.
## 4753 Off Color, a New York Times video series, looks at how artists of color are making sharp social commentary about race in America through comedy and performance.
## 4802 By math educator Daniel Finkel, a puzzle thats childs play. Can you figure it out?
## 6463 The Spanish artist Miquel Barcel closely examines the basic materials of life in response to Edward Hirsch questioning his own belief in a higher power.
## WordCount PubDate Popular UniqueID
## 226 459 2014-09-04 16:55:57 0 226
## 995 301 2014-09-15 16:05:13 0 995
## 3327 236 2014-10-14 14:45:51 0 3327
## 4753 393 2014-11-02 05:00:13 0 4753
## 4802 1628 2014-11-03 12:00:04 1 4802
## 6463 264 2014-11-27 12:00:09 0 6463
## NewsDesk SectionName SubsectionName
## 6527 Foreign
## 6528 Opinion Room For Debate
## 6529 Foreign
## 6530 TStyle
## 6531 Multimedia
## 6532 Business
## Headline
## 6527 1914: Russians Dominate in East Poland
## 6528 Finding a Secretary of Defense
## 6529 1889: Metropolitan Opera House Reopens in New York
## 6530 The Daily Gift: Picasso Plates for Creative Dining
## 6531 Racing From New York to Barcelona
## 6532 Math Anxiety: Why Hollywood Makes Robots of Alan Turing and Other Geniuses
## Snippet
## 6527 From the International Herald Tribune archives: Russians dominate in East Poland in 1914.
## 6528 If Chuck Hagel isn't the right Pentagon chief to respond to an onslaught of global crises, who is?
## 6529 From the International Herald Tribune archives: The Metropolitan Opera House reopens in New York in 1889.
## 6530 Each day until Christmas, the editors of T share a new holiday gift idea.
## 6531 A sailboat race from New York to Barcelona was the setting for a thrilling and sometimes terrifying video about this challenging sport.
## 6532 The visionary who stares at formulas written on walls or mirrors or better yet, thin air has become a Hollywood trope. So has the depiction of the genius who cant connect with real people.
## Abstract
## 6527 From the International Herald Tribune archives: Russians dominate in East Poland in 1914.
## 6528 If Chuck Hagel isn't the right Pentagon chief to respond to an onslaught of global crises, who is?
## 6529 From the International Herald Tribune archives: The Metropolitan Opera House reopens in New York in 1889.
## 6530 Each day until Christmas, the editors of T share a new holiday gift idea.
## 6531 A sailboat race from New York to Barcelona was the setting for a thrilling and sometimes terrifying video about this challenging sport.
## 6532 The visionary who stares at formulas written on walls or mirrors or better yet, thin air has become a Hollywood trope. So has the depiction of the genius who cant connect with real people.
## WordCount PubDate Popular UniqueID
## 6527 176 2014-11-30 13:48:40 0 6527
## 6528 1597 2014-11-30 13:27:23 0 6528
## 6529 214 2014-11-30 09:44:57 0 6529
## 6530 61 2014-11-30 09:00:43 0 6530
## 6531 441 2014-11-30 09:00:22 0 6531
## 6532 921 2014-11-30 07:00:40 0 6532
## 'data.frame': 6532 obs. of 10 variables:
## $ NewsDesk : chr "Business" "Culture" "Business" "Business" ...
## $ SectionName : chr "Crosswords/Games" "Arts" "Business Day" "Business Day" ...
## $ SubsectionName: chr "" "" "Dealbook" "Dealbook" ...
## $ Headline : chr "More School Daze" "New 96-Page Murakami Work Coming in December" "Public Pension Funds Stay Mum on Corporate Expats" "Boot Camp for Bankers" ...
## $ Snippet : chr "A puzzle from Ethan Cooper that reminds me that a bill is due." "The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His"| __truncated__ "Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little"| __truncated__ "As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service "| __truncated__ ...
## $ Abstract : chr "A puzzle from Ethan Cooper that reminds me that a bill is due." "The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His"| __truncated__ "Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little"| __truncated__ "As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service "| __truncated__ ...
## $ WordCount : int 508 285 1211 1405 181 245 258 893 1077 188 ...
## $ PubDate : chr "2014-09-01 22:00:09" "2014-09-01 21:14:07" "2014-09-01 21:05:36" "2014-09-01 20:43:34" ...
## $ Popular : int 1 0 0 1 1 1 0 1 1 0 ...
## $ UniqueID : int 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, "comment")= chr "glbObsTrn"
## NULL
## [1] "Reading file ./data/NYTimesBlogTest.csv..."
## [1] "dimensions of data in ./data/NYTimesBlogTest.csv: 1,870 rows x 9 cols"
## NewsDesk SectionName SubsectionName
## 1 Culture
## 2 Culture Arts
## 3 Business Crosswords/Games
## 4 Business Business Day Dealbook
## 5 Science Health
## 6 Science Health
## Headline
## 1 'Birdman' Tops the Gothams
## 2 'Sleepy Hollow' Recap: A Not-So-Shocking Death
## 3 Drinking Buddy For Falstaff
## 4 Encouraging Public Service, Through Wall Street's 'Revolving Door'
## 5 Therapy Prevents Repeat Suicide Attempts
## 6 Hoping for a Good Death
## Snippet
## 1 The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner.
## 2 In the fall season finale, a question of where the series has many places to go.
## 3 In which Timothy Polin reveals his potty mouth.
## 4 The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than on good public policy.
## 5 Short-term psychotherapy may be an effective way to prevent repeated suicide attempts.
## 6 What I hadnt considered before my fathers heart attack was the precise meaning of not wanting to live hooked up to machines.
## Abstract
## 1 The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner.
## 2 In the fall season finale, a question of where the series has many places to go.
## 3 In which Timothy Polin reveals his potty mouth.
## 4 The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than on good public policy.
## 5 Short-term psychotherapy may be an effective way to prevent repeated suicide attempts.
## 6 What I hadnt considered before my fathers heart attack was the precise meaning of not wanting to live hooked up to machines.
## WordCount PubDate UniqueID
## 1 111 2014-12-01 22:45:24 6533
## 2 558 2014-12-01 22:01:34 6534
## 3 788 2014-12-01 22:00:26 6535
## 4 915 2014-12-01 21:04:13 6536
## 5 213 2014-12-01 19:13:20 6537
## 6 938 2014-12-01 19:05:12 6538
## NewsDesk SectionName SubsectionName
## 3 Business Crosswords/Games
## 334 OpEd Opinion
## 725 TStyle
## 732 Business Business Day Dealbook
## 752 Business Business Day Dealbook
## 864
## Headline
## 3 Drinking Buddy For Falstaff
## 334 Facts & Figures: America’s Unique Take on Maternity Leave
## 725 Ansel Elgort Buttons Up in Brioni
## 732 A Shake-Up as the Financial World Infiltrates Philanthropy
## 752 Coupang, a South Korean E-Commerce Site, Raises $300 Million
## 864 Today in Politics
## Snippet
## 3 In which Timothy Polin reveals his potty mouth.
## 334 In the U.S., paid parental leave is more of a perk than a guarantee.
## 725 The actor brought a tinge of youthfulness to the classic Italian houses retro-tailored look.
## 732 Donor-advised funds help investors get deductions for charitable donations in one year, but society doesnt get the benefit of the money right away.
## 752 The latest financing round underscores Coupangs maturity and its ambitions to one day be a publicly traded company.
## 864 The 113th Congress is concluding with partisan brinksmanship and one last mad scramble for votes to pass a $1.1 trillion spending package.
## Abstract
## 3 In which Timothy Polin reveals his potty mouth.
## 334 In the U.S., paid parental leave is more of a perk than a guarantee.
## 725 The actor brought a tinge of youthfulness to the classic Italian houses retro-tailored look.
## 732 Donor-advised funds help investors get deductions for charitable donations in one year, but society doesnt get the benefit of the money right away.
## 752 The latest financing round underscores Coupangs maturity and its ambitions to one day be a publicly traded company.
## 864 The 113th Congress is concluding with partisan brinksmanship and one last mad scramble for votes to pass a $1.1 trillion spending package.
## WordCount PubDate UniqueID
## 3 788 2014-12-01 22:00:26 6535
## 334 160 2014-12-04 11:45:20 6866
## 725 89 2014-12-10 12:30:47 7257
## 732 1172 2014-12-10 12:00:38 7264
## 752 353 2014-12-10 08:30:41 7284
## 864 1544 2014-12-11 07:09:25 7396
## NewsDesk SectionName SubsectionName
## 1865
## 1866 Business Technology
## 1867 Metro N.Y. / Region
## 1868 Multimedia
## 1869 Foreign World Asia Pacific
## 1870 Science Health
## Headline
## 1865 Today in Politics
## 1866 Uber Suspends Operations in Spain
## 1867 New York Today: The Year in News
## 1868 New Year, Old Memories, in Times Square
## 1869 Hong Kong Police Criticized After 14-Year-Old's Detention
## 1870 The Super-Short Workout and Other Fitness Trends
## Snippet
## 1865 House Republicans are ending the year on a defensive note over Representative Steve Scalises 2002 speech to a white supremacist group.
## 1866 In a first in the growing pushback against Ubers global expansion, a judges ruling barred telecommunications operators and banks from supporting the companys services.
## 1867 Wednesday: The most read stories of 2014, teeth-chattering cold, and its New Years Eve.
## 1868 What happens when you combine Burning Man, Independence Day fireworks, the last day of school and a full-contact Black Friday sale-a-bration? New Years Eve in Times Square.
## 1869 The authorities have been accused of trying to intimidate young pro-democracy protesters and their families after a 14-year-old girl was detained on suspicion of drawing flowers in chalk near government headquarters and sent to a juvenile home.
## 1870 The big story in exercise science this year was the super-short workout, although many other fitness-related themes emerged in 2014.
## Abstract
## 1865 House Republicans are ending the year on a defensive note over Representative Steve Scalises 2002 speech to a white supremacist group.
## 1866 In a first in the growing pushback against Ubers global expansion, a judges ruling barred telecommunications operators and banks from supporting the companys services.
## 1867 Wednesday: The most read stories of 2014, teeth-chattering cold, and its New Years Eve.
## 1868 What happens when you combine Burning Man, Independence Day fireworks, the last day of school and a full-contact Black Friday sale-a-bration? New Years Eve in Times Square.
## 1869 The authorities have been accused of trying to intimidate young pro-democracy protesters and their families after a 14-year-old girl was detained on suspicion of drawing flowers in chalk near government headquarters and sent to a juvenile home.
## 1870 The big story in exercise science this year was the super-short workout, although many other fitness-related themes emerged in 2014.
## WordCount PubDate UniqueID
## 1865 1616 2014-12-31 07:03:46 8397
## 1866 292 2014-12-31 06:09:32 8398
## 1867 1010 2014-12-31 06:06:58 8399
## 1868 387 2014-12-31 05:00:19 8400
## 1869 717 2014-12-31 04:16:29 8401
## 1870 818 2014-12-31 00:01:10 8402
## 'data.frame': 1870 obs. of 9 variables:
## $ NewsDesk : chr "Culture" "Culture" "Business" "Business" ...
## $ SectionName : chr "" "Arts" "Crosswords/Games" "Business Day" ...
## $ SubsectionName: chr "" "" "" "Dealbook" ...
## $ Headline : chr "'Birdman' Tops the Gothams" "'Sleepy Hollow' Recap: A Not-So-Shocking Death" "Drinking Buddy For Falstaff" "Encouraging Public Service, Through Wall Street's 'Revolving Door'" ...
## $ Snippet : chr "The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner." "In the fall season finale, a question of where the series has many places to go." "In which Timothy Polin reveals his potty mouth." "The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than "| __truncated__ ...
## $ Abstract : chr "The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner." "In the fall season finale, a question of where the series has many places to go." "In which Timothy Polin reveals his potty mouth." "The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than "| __truncated__ ...
## $ WordCount : int 111 558 788 915 213 938 1336 2644 752 99 ...
## $ PubDate : chr "2014-12-01 22:45:24" "2014-12-01 22:01:34" "2014-12-01 22:00:26" "2014-12-01 21:04:13" ...
## $ UniqueID : int 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 ...
## - attr(*, "comment")= chr "glbObsNew"
## NULL
## [1] "Partition stats:"
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
## Loading required package: tcltk
## Popular .src .n
## 1 0 Train 5439
## 2 NA Test 1870
## 3 1 Train 1093
## Popular .src .n
## 1 0 Train 5439
## 2 NA Test 1870
## 3 1 Train 1093
## .src .n
## 1 Train 6532
## 2 Test 1870
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Loading required package: lazyeval
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
##
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
##
## The following object is masked from 'package:stats':
##
## nobs
##
## The following object is masked from 'package:utils':
##
## object.size
## [1] "Found 0 duplicates by all features:"
## NULL
## [1] "Partition stats:"
## Popular .src .n
## 1 0 Train 5439
## 2 NA Test 1870
## 3 1 Train 1093
## Popular .src .n
## 1 0 Train 5439
## 2 NA Test 1870
## 3 1 Train 1093
## .src .n
## 1 Train 6532
## 2 Test 1870
## label step_major step_minor label_minor bgn end elapsed
## 1 import.data 1 0 0 9.601 23.864 14.263
## 2 inspect.data 2 0 0 23.864 NA NA
2.0: inspect data
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1870 rows containing non-finite values (stat_bin).
## Loading required package: reshape2
## Popular.0 Popular.1 Popular.NA
## Test NA NA 1870
## Train 5439 1093 NA
## Popular.0 Popular.1 Popular.NA
## Test NA NA 1
## Train 0.8326699 0.1673301 NA
## [1] "numeric data missing in glbObsAll: "
## Popular
## 1870
## [1] "numeric data w/ 0s in glbObsAll: "
## WordCount Popular
## 109 5439
## [1] "numeric data w/ Infs in glbObsAll: "
## named integer(0)
## [1] "numeric data w/ NaNs in glbObsAll: "
## named integer(0)
## [1] "string data missing in glbObsAll: "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate
## 17 0
## Popular Popular.fctr .n
## 1 0 N 5439
## 2 NA <NA> 1870
## 3 1 Y 1093
## Warning: Removed 1 rows containing missing values (position_stack).
## Popular.fctr.N Popular.fctr.Y Popular.fctr.NA
## Test NA NA 1870
## Train 5439 1093 NA
## Popular.fctr.N Popular.fctr.Y Popular.fctr.NA
## Test NA NA 1
## Train 0.8326699 0.1673301 NA
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## label step_major step_minor label_minor bgn end elapsed
## 2 inspect.data 2 0 0 23.864 26.893 3.029
## 3 scrub.data 2 1 1 26.894 NA NA
2.1: scrub data
## [1] "numeric data missing in glbObsAll: "
## Popular Popular.fctr
## 1870 1870
## [1] "numeric data w/ 0s in glbObsAll: "
## WordCount Popular
## 109 5439
## [1] "numeric data w/ Infs in glbObsAll: "
## named integer(0)
## [1] "numeric data w/ NaNs in glbObsAll: "
## named integer(0)
## [1] "string data missing in glbObsAll: "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate
## 17 0
## label step_major step_minor label_minor bgn end elapsed
## 3 scrub.data 2 1 1 26.894 27.723 0.83
## 4 transform.data 2 2 2 27.724 NA NA
2.2: transform data
## [1] "Creating new feature: NDSSName.my..."
## [1] "Creating new feature: WordCount.log1p..."
## [1] "Creating new feature: WordCount.root2..."
## [1] "Creating new feature: WordCount.nexp..."
## label step_major step_minor label_minor bgn end elapsed
## 4 transform.data 2 2 2 27.724 28.037 0.313
## 5 extract.features 3 0 0 28.037 NA NA
3.0: extract features
## label step_major step_minor label_minor bgn end
## 1 extract.features_bgn 1 0 0 28.094 NA
## elapsed
## 1 NA
## label step_major step_minor label_minor
## 1 extract.features_bgn 1 0 0
## 2 extract.features_factorize.str.vars 2 0 0
## bgn end elapsed
## 1 28.094 28.104 0.011
## 2 28.105 NA NA
## NewsDesk SectionName SubsectionName Headline
## "NewsDesk" "SectionName" "SubsectionName" "Headline"
## Snippet Abstract PubDate .src
## "Snippet" "Abstract" "PubDate" ".src"
## NDSSName.my
## "NDSSName.my"
## Warning: Creating factors of string variable: NDSSName.my: # of unique
## values: 21
## label step_major step_minor label_minor
## 2 extract.features_factorize.str.vars 2 0 0
## 3 extract.features_xtract.DateTime.vars 3 0 0
## bgn end elapsed
## 2 28.105 28.122 0.018
## 3 28.123 NA NA
## [1] "Extracting features from DateTime(s): PubDate"
## Loading required package: XML
## [1] "**********"
## [1] "Consider adding state & city holidays for glbFeatsDateTime: PubDate"
## [1] "**********"
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## [1] "Missing data for numerics:"
## PubDate.last32.log1p.ctg
## 30
## label step_major step_minor label_minor
## 3 extract.features_xtract.DateTime.vars 3 0 0
## 4 extract.features_end 4 0 0
## bgn end elapsed
## 3 28.123 33.669 5.546
## 4 33.669 NA NA
## label step_major step_minor label_minor
## 3 extract.features_xtract.DateTime.vars 3 0 0
## 2 extract.features_factorize.str.vars 2 0 0
## 1 extract.features_bgn 1 0 0
## bgn end elapsed duration
## 3 28.123 33.669 5.546 5.546
## 2 28.105 28.122 0.018 0.017
## 1 28.094 28.104 0.011 0.010
## [1] "Total Elapsed Time: 33.669 secs"
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## label step_major step_minor label_minor bgn end
## 5 extract.features 3 0 0 28.037 34.987
## 6 manage.missing.data 3 1 1 34.987 NA
## elapsed
## 5 6.95
## 6 NA
3.1: manage missing data
## [1] "numeric data missing in : "
## Popular Popular.fctr
## 1870 1870
## [1] "numeric data w/ 0s in : "
## WordCount Popular WordCount.log1p
## 109 5439 109
## WordCount.root2 WordCount.nexp PubDate.wkday.fctr
## 109 2044 378
## PubDate.wkend PubDate.hlday PubDate.day.minutes
## 7787 8160 5
## PubDate.last2.log1p PubDate.last4.log1p PubDate.last8.log1p
## 2 4 8
## PubDate.last16.log1p PubDate.last32.log1p PubDate.last2.log1p.ctg
## 16 32 42
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg PubDate.last16.log1p.ctg
## 84 168 336
## PubDate.last32.log1p.ctg
## 670
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate NDSSName.my
## 17 0 0
## [1] "numeric data missing in : "
## Popular Popular.fctr
## 1870 1870
## [1] "numeric data w/ 0s in : "
## WordCount Popular WordCount.log1p
## 109 5439 109
## WordCount.root2 WordCount.nexp PubDate.wkday.fctr
## 109 2044 378
## PubDate.wkend PubDate.hlday PubDate.day.minutes
## 7787 8160 5
## PubDate.last2.log1p PubDate.last4.log1p PubDate.last8.log1p
## 2 4 8
## PubDate.last16.log1p PubDate.last32.log1p PubDate.last2.log1p.ctg
## 16 32 42
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg PubDate.last16.log1p.ctg
## 84 168 336
## PubDate.last32.log1p.ctg
## 670
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate NDSSName.my
## 17 0 0
## label step_major step_minor label_minor bgn end
## 6 manage.missing.data 3 1 1 34.987 36.13
## 7 cluster.data 3 2 2 36.130 NA
## elapsed
## 6 1.143
## 7 NA
# 3.2: cluster data
# Tabulate the distribution of `entropy_var` within every (by_var, .clusterid)
# group and attach each group's entropy and observation count.
#
# Args:
#   obs_df:      data frame of observations.
#   entropy_var: name (string) of the column whose distribution is measured;
#                rows where it is NA are dropped.
#   by_var:      name (string) of the grouping column; NULL puts all
#                observations into a single ".default" group.
#
# Returns:
#   One row per by_var/.clusterid combination: the united key column, one
#   count column per observed `entropy_var` value (0 where absent), `.entropy`
#   (entropy::entropy() with method = "ML") and `.knt` (row total of the counts).
mycompute_entropy_df <- function(obs_df, entropy_var, by_var=NULL) {
    # library() stops when a package is missing; require() would only return
    # FALSE and let the function fail later with an unrelated error.
    library(lazyeval)
    library(dplyr)
    library(tidyr)
    if (is.null(by_var)) {
        by_var <- ".default"
        obs_df$.default <- as.factor(".default")
    }
    # Exact column-name test: a substring match would be fooled by columns
    # such as "x.clusterid" and then skip creating the column used below.
    if (!(".clusterid" %in% names(obs_df)))
        obs_df$.clusterid <- 1
    cluster_df <- obs_df %>%
        count_(c(by_var, ".clusterid", entropy_var)) %>%
        dplyr::filter(n > 0) %>%
        dplyr::filter_(interp(~(!is.na(var)), var=as.name(entropy_var))) %>%
        unite_(paste0(by_var, ".clusterid"),
               c(interp(by_var), ".clusterid")) %>%
        spread_(interp(entropy_var), "n", fill=0)
    # head(cluster_df)
    # sum(cluster_df$n)
    # seq_len() yields an empty index for an empty table (1:0 would visit rows
    # 1 and 0); vapply() guarantees a numeric vector whatever the row count.
    row_ixs <- seq_len(nrow(cluster_df))
    # Both vectors are computed before either is attached, so the count columns
    # selected by `-1` never include the derived columns.
    # entropy:: is namespace-qualified because this function does not attach
    # the entropy package itself.
    tmp.entropy <- vapply(row_ixs,
        function(row) entropy::entropy(as.numeric(cluster_df[row, -1]), method = "ML"),
        numeric(1))
    tmp.knt <- vapply(row_ixs,
        function(row) sum(as.numeric(cluster_df[row, -1])),
        numeric(1))
    cluster_df$.entropy <- tmp.entropy; cluster_df$.knt <- tmp.knt
    #print(cluster_df)
    return(cluster_df)
}
if (glb_cluster) {
require(proxy)
#require(hash)
require(dynamicTreeCut)
require(entropy)
require(tidyr)
require(ggdendro)
# Pairwise weighted cosine-style distance between the rows of `x`.
#
# Args:
#   x:       matrix, or an object coercible with as.matrix(); one row per observation.
#   y:       accepted but not used by this implementation.
#   weights: one weight per column of `x`; NULL means a weight of 1 for every column.
#
# Returns:
#   nrow(x) x nrow(x) matrix of 1 - similarity, with NaN entries (e.g. 0/0 for
#   an all-zero row) replaced by 1 and the diagonal forced to 0.
#
# NOTE(review): with weights w the similarity works out to
#   sum_k(w_k^2 * x_ik * x_jk) / sqrt(sum_k(w_k * x_ik^2) * sum_k(w_k * x_jk^2)),
# i.e. squared weights in the numerator only. It equals the ordinary cosine
# similarity for the default unit weights -- confirm this is intended otherwise.
mywgtdcosine_dist <- function(x, y=NULL, weights=NULL) {
    if (!inherits(x, "matrix")) {
        x <- as.matrix(x)
    }
    if (is.null(weights)) {
        weights <- rep(1, ncol(x))
    }
    n_obs <- nrow(x)
    wgt_total <- sum(weights)
    # Every row of this matrix holds the full vector of normalised weights
    norm_wgts_mtrx <- matrix(rep(weights / wgt_total, n_obs), nrow = n_obs,
                             byrow = TRUE)
    scaled_x <- x * norm_wgts_mtrx
    # Weighted squared length of each row, as a one-column matrix
    row_sqlen <- as.matrix(rowSums((x ^ 2) * norm_wgts_mtrx))
    norm_prod <- sqrt(row_sqlen %*% t(row_sqlen))
    cross_prod <- scaled_x %*% t(scaled_x)
    dist_mtrx <- 1 - (wgt_total * cross_prod / norm_prod)
    # Undefined ratios are treated as maximally distant
    dist_mtrx[is.nan(dist_mtrx)] <- 1
    diag(dist_mtrx) <- 0
    return(dist_mtrx)
}
#pr_DB$delete_entry("mywgtdcosine");
# Need to do this only once across runs ?
if (!pr_DB$entry_exists("mywgtdcosine")) {
pr_DB$set_entry(FUN = mywgtdcosine_dist, names = c("mywgtdcosine"))
pr_DB$modify_entry(names="mywgtdcosine", type="metric", loop=FALSE)
}
#pr_DB$get_entry("mywgtdcosine")
# glb_hash <- hash(key=unique(glbObsAll$myCategory),
# values=1:length(unique(glbObsAll$myCategory)))
# glb_hash_lst <- hash(key=unique(glbObsAll$myCategory),
# values=1:length(unique(glbObsAll$myCategory)))
#stop(here"); glb2Sav(); glbObsAll <- savObsAll
# Columns used for clustering: names containing "<X>.P." or "<X>.T.", where
# <X> is the upper-cased first letter of any entry in glbFeatsText.
cluster_vars <- grep(paste0("[",
                            toupper(paste0(substr(glbFeatsText, 1, 1),
                                           collapse = "")),
                            "]\\.[PT]\\."),
                     names(glbObsAll), value = TRUE)

# Assign correlations with rsp_var as weights for cosine distance
print("Clustering features: ")
# drop = FALSE keeps a data frame even when only one column matches; without
# it cor() returns a bare scalar and the variable name is lost from the row
# names, which are used later to select the distance columns.
cluster_vars_df <- data.frame(abs.cor.y = abs(cor(
    glbObsAll[glbObsAll$.src == "Train", cluster_vars, drop = FALSE],
    as.numeric(glbObsAll[glbObsAll$.src == "Train", glb_rsp_var]),
    use = "pairwise.complete.obs")))
# Drop variables with an undefined correlation, sort ascending, show the top 5.
cluster_vars_df <- orderBy(~ abs.cor.y,
                           subset(cluster_vars_df, !is.na(abs.cor.y)))
print(tail(cluster_vars_df, 5))

# Correlation of the .rnorm column with the response, printed for comparison.
print(sprintf(" .rnorm cor: %0.4f",
              cor(glbObsAll[glbObsAll$.src == "Train", ".rnorm"],
                  as.numeric(glbObsAll[glbObsAll$.src == "Train", glb_rsp_var]),
                  use = "pairwise.complete.obs")))

# Entropy of the entropy variable over all observations.
allobs_ent <- entropy(table(glbObsAll[, glb_cluster_entropy_var]),
                      method = "ML")
print(sprintf("glbObsAll Entropy: %0.4f", allobs_ent))

# Entropy per category, then its count-weighted mean relative to the overall
# entropy.
category_df <- mycompute_entropy_df(obs_df = glbObsAll,
                                    entropy_var = glb_cluster_entropy_var,
                                    by_var = glbFeatsCategory)
print(category_df)
category_ent <- weighted.mean(category_df$.entropy, category_df$.knt)
print(sprintf("glbObsAll$%s Entropy: %0.4f (%0.4f pct)",
              glbFeatsCategory,
              category_ent,
              100 * category_ent / allobs_ent))
# State shared by the per-category loop: the list collecting each category's
# size/entropy table and the layout row the next category plots into.
glb_cluster_size_df_lst <- list()
pltIx <- 1
# Every observation starts out in cluster 1.
glbObsAll$.clusterid <- 1
#print(max(table(glbObsAll$myCategory.fctr) / 20))
#stop(here"); glb2Sav()
grp_ids <- sort(unique(glbObsAll[, glbFeatsCategory]))
# One 480px-high row per category, two 480px-wide panels per row.
png(filename = paste0(glb_out_pfx, "FeatsTxtClusters.png"),
    width = 2 * 480,
    height = length(grp_ids) * 480)
grid.newpage()
pushViewport(
    viewport(layout = grid.layout(nrow = length(grp_ids), ncol = 2))
)
for (grp in grp_ids) {
# if (grep(grp, levels(grp_ids)) <= 6) next
# if (grep(grp, levels(grp_ids)) > 9) next
# if (grep(grp, levels(grp_ids)) != 10) next
print(sprintf("Category: %s", grp))
# Observations of the current category; the entropy variable is made a factor
# if it is not one already.
ctgry_allobs_df <- glbObsAll[glbObsAll[, glbFeatsCategory] == grp, ]
if (!is.factor(ctgry_allobs_df[, glb_cluster_entropy_var])) {
    ctgry_allobs_df[, glb_cluster_entropy_var] <-
        as.factor(ctgry_allobs_df[, glb_cluster_entropy_var])
}
# Pairwise distances between this category's observations over the columns
# named by row.names(cluster_vars_df), using the registered "mywgtdcosine"
# method with each column weighted by its abs.cor.y.
#dstns_dist <- proxy::dist(ctgry_allobs_df[, cluster_vars], method = "cosine")
dstns_dist <- proxy::dist(ctgry_allobs_df[, row.names(cluster_vars_df)],
method = "mywgtdcosine",
weights = cluster_vars_df$abs.cor.y)
# Custom distance functions return a crossdist object
#dstns_mtrx <- as.matrix(dstns_dist)
# Rebuild a plain matrix from the object's values and its "dim"/"dimnames"
# attributes, then convert that matrix to a dist object (used by hclust()).
dstns_mtrx <- matrix(as.vector(dstns_dist), nrow=attr(dstns_dist, "dim")[1],
dimnames=attr(dstns_dist, "dimnames"))
dstns_dist <- as.dist(dstns_mtrx)
# Diagnostics: print the most distant and the closest pair of observations in
# this category, with their id, entropy variable, category, text and
# clustering columns.
print(sprintf("max distance(%0.4f) pair:", max(dstns_mtrx)))
# print(dim(dstns_mtrx))
# print(sprintf("which.max: %d", which.max(dstns_mtrx)))
# which.max() returns a column-major linear index; ceiling(index / ncol) turns
# it into an index along one dimension, and the partner is then looked up
# within that row.
# NOTE(review): this assumes dstns_mtrx is square and symmetric -- confirm.
row_ix <- ceiling(which.max(dstns_mtrx) / ncol(dstns_mtrx))
col_ix <- which.max(dstns_mtrx[row_ix, ])
# print(sprintf("row_ix: %d", row_ix)); print(sprintf("col_ix: %d", col_ix));
# print(dim(ctgry_allobs_df))
print(ctgry_allobs_df[c(row_ix, col_ix),
c(glb_id_var, glb_cluster_entropy_var, glbFeatsCategory, glbFeatsText, cluster_vars)])
# Work on a copy whose diagonal is set to 1 so that an observation's distance
# to itself is not picked as the minimum.
min_dstns_mtrx <- dstns_mtrx
diag(min_dstns_mtrx) <- 1
# Float representations issue -2.22e-16 vs. 0.0000
print(sprintf("min distance(%0.4f) pair:", min(min_dstns_mtrx)))
row_ix <- ceiling(which.min(min_dstns_mtrx) / ncol(min_dstns_mtrx))
col_ix <- which.min(min_dstns_mtrx[row_ix, ])
print(ctgry_allobs_df[c(row_ix, col_ix),
c(glb_id_var, glb_cluster_entropy_var, glbFeatsCategory, glbFeatsText,
cluster_vars)])
set.seed(glb_cluster.seed)
clusters <- hclust(dstns_dist, method = "ward.D2")
# Workaround to avoid "Error in cutree(dendro, h = heightcutoff) : the 'height' component of 'tree' is not sorted (increasingly)"
# When the heights match their sorted order up to numerical tolerance, round
# them so that tiny floating-point differences cannot leave them unsorted.
# all.equal() returns a character description (not FALSE) when the vectors
# differ, which if() cannot interpret as a condition; isTRUE() turns that case
# into "skip the rounding" instead of an error.
if (isTRUE(all.equal(clusters$height, sort(clusters$height))))
    clusters$height <- round(clusters$height, 6)
# Label the leaves with the observation ids.
clusters$labels <- ctgry_allobs_df[, glb_id_var]
# Dendrogram for this category: convert the hclust result to ggdendro plotting
# data, then attach to each leaf label the glb_rsp_var value of the row whose
# glb_id_var equals that label, so leaves can be coloured by response.
clustersDD <- dendro_data(clusters)
clustersDD$labels[, glb_rsp_var] <- sapply(clustersDD$labels$label, function(id)
ctgry_allobs_df[id == ctgry_allobs_df[, glb_id_var], glb_rsp_var])
# Drawn into column 1 of row pltIx of the viewport layout; the coloured points
# are placed at the smallest segment y value.
print(ggdendrogram(clustersDD, rotate = TRUE, size = 2) +
geom_point(data = clustersDD$labels,
aes_string(x = "x", color = glb_rsp_var), y = min(clustersDD$segments$y)) +
coord_flip(ylim = c(min(clustersDD$segments$y),
max(clustersDD$segments$y))) +
ggtitle(grp),
vp = viewport(layout.pos.row = pltIx, layout.pos.col = 1))
# clusters$labels <- ctgry_allobs_df[, glb_id_var]
# clustersDD <- dendro_data(clusters)
# clustersDD$labels$color <- sapply(clustersDD$labels$label, function(id)
# ctgry_allobs_df[id == ctgry_allobs_df[, glb_id_var], glb_rsp_var])
# print(ggdendrogram(clustersDD, rotate = TRUE, size = 2) +
# geom_point(data = clustersDD$labels,
# aes_string(x = "x", color = "color"), y = min(clustersDD$segments$y)) +
# coord_flip(ylim = c(min(clustersDD$segments$y),
# max(clustersDD$segments$y))))
# print(ggdendrogram(clustersDD, rotate = TRUE, size = 2) +
# geom_point(data = clustersDD$labels,
# aes_string(x = "x", y = "y", color = "color")))
# myplclust(clusters, lab=ctgry_allobs_df[, glb_id_var],
# lab.col=unclass(ctgry_allobs_df[, glb_cluster_entropy_var]))
# Sweep over candidate minimum cluster sizes and record, for each, the
# count-weighted mean entropy of the resulting clusters.
# First row is the baseline: the whole category treated as a single group
# (minclustersize = number of observations, entropy of the full category).
opt_minclustersize_df <- data.frame(minclustersize = nrow(ctgry_allobs_df),
entropy = entropy(table(ctgry_allobs_df[, glb_cluster_entropy_var]),
method = "ML"))
# Candidates: 5 evenly spaced values from n/2 down to n/10, truncated to
# integers by as.integer().
for (minclustersize in
as.integer(seq(nrow(ctgry_allobs_df) / 2, nrow(ctgry_allobs_df) / 10,
length = 5))) {
clusterGroups <- cutreeDynamic(clusters, minClusterSize = minclustersize,
method = "tree", deepSplit = 0)
# Unassigned groups are labeled 0; the largest group has label 1
clusterGroups[clusterGroups == 0] <- 1
# Temporarily store the assignment on the category frame so the entropy
# helper can read it from the .clusterid column.
ctgry_allobs_df$.clusterid <- clusterGroups
ctgry_clstrs_df <- mycompute_entropy_df(ctgry_allobs_df,
glb_cluster_entropy_var)
opt_minclustersize_df <- rbind(opt_minclustersize_df,
data.frame(minclustersize = minclustersize,
entropy = weighted.mean(ctgry_clstrs_df$.entropy, ctgry_clstrs_df$.knt)))
}
# Pick the candidate size with the lowest entropy (first one on ties).
opt_minclustersize <-
    with(opt_minclustersize_df, minclustersize[which.min(entropy)])
# Highlight the chosen size in red; all other candidates are blue.
opt_minclustersize_df$.color <-
    with(opt_minclustersize_df,
         ifelse(minclustersize == opt_minclustersize, "red", "blue"))
# Keep this category's sweep table, including the colour column.
glb_cluster_size_df_lst[[grp]] <- opt_minclustersize_df
# Entropy vs. minimum cluster size, drawn into column 2 of row pltIx.
print(
    ggplot(data = opt_minclustersize_df,
           mapping = aes(x = minclustersize, y = entropy)) +
        geom_point(aes(color = .color)) +
        scale_color_identity() +
        guides(color = "none") +
        geom_line(),
    vp = viewport(layout.pos.row = pltIx, layout.pos.col = 2)
)
# select minclustersize that minimizes entropy
# Re-cut the tree with the chosen size and write the cluster ids back to this
# category's rows in glbObsAll.
clusterGroups <- cutreeDynamic(clusters, minClusterSize = opt_minclustersize,
method = "tree",
deepSplit = 0)
# Unassigned groups are labeled 0; the largest group has label 1
# NOTE(review): inside the for loop the two table() results below are neither
# printed nor stored; wrap them in print() if the cross-tabulations are wanted
# in the output.
table(clusterGroups, ctgry_allobs_df[, glb_cluster_entropy_var],
useNA = "ifany")
clusterGroups[clusterGroups == 0] <- 1
table(clusterGroups, ctgry_allobs_df[, glb_cluster_entropy_var], useNA = "ifany")
glbObsAll[glbObsAll[, glbFeatsCategory] == grp,]$.clusterid <-
clusterGroups
# Advance to the next row of the plot layout.
pltIx <- pltIx + 1
}
# Close the PNG device opened before the per-category loop.
dev.off()
#all.equal(savObsAll_clusterid, glbObsAll$.clusterid)

# Entropy after clustering, reported relative to the per-category entropy.
cluster_df <- mycompute_entropy_df(obs_df = glbObsAll,
                                   entropy_var = glb_cluster_entropy_var,
                                   by_var = glbFeatsCategory)
print(cluster_df)
cluster_ent <- weighted.mean(cluster_df$.entropy, cluster_df$.knt)
print(sprintf("glbObsAll$%s$.clusterid Entropy: %0.4f (%0.4f pct)",
              glbFeatsCategory,
              cluster_ent,
              100 * cluster_ent / category_ent))

glbObsAll$.clusterid.fctr <- as.factor(glbObsAll$.clusterid)
# .clusterid.fctr is created automatically (probably ?) later
# The raw numeric id is excluded as a feature.
glbFeatsExclude <- c(glbFeatsExclude, ".clusterid")
# Register .clusterid.fctr as interaction-only, paired with the category
# feature's ".fctr" name (appended when the name does not already contain it).
if (!is.null(glbFeatsCategory)) {
    # glbFeatsInteractionOnly[ifelse(grepl("\\.fctr", glbFeatsCategory),
    #                                glbFeatsCategory,
    #                                paste0(glbFeatsCategory, ".fctr"))] <-
    #     c(".clusterid.fctr")
    glbFeatsInteractionOnly[[".clusterid.fctr"]] <-
        ifelse(grepl("\\.fctr", glbFeatsCategory),
               glbFeatsCategory,
               paste0(glbFeatsCategory, ".fctr"))
}
# Optionally exclude the columns that were used to build the clusters.
if (glbFeatsTextClusterVarsExclude) {
    glbFeatsExclude <- c(glbFeatsExclude, cluster_vars)
}
}
# Last call for data modifications
#stop(here") # savObsAll <- glbObsAll
# glbObsAll[(glbObsAll$PropR == 0.75) & (glbObsAll$State == "Hawaii"), "PropR.fctr"] <- "N"
# Re-partition
# Rebuild the training and new-observation frames from glbObsAll, selecting
# rows by their .src value ("Train" / "Test"), so both frames carry whatever
# columns were added to glbObsAll above.
glbObsTrn <- subset(glbObsAll, .src == "Train")
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
glbObsNew <- subset(glbObsAll, .src == "Test")
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
# Record the start of the "partition.data.training" step in the chunk log.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "partition.data.training", major.inc = TRUE)
## label step_major step_minor label_minor bgn end
## 7 cluster.data 3 2 2 36.130 36.454
## 8 partition.data.training 4 0 0 36.455 NA
## elapsed
## 7 0.324
## 8 NA
# 4.0: partition data training
## [1] "Prediction Hints by Catgeory:"
## NDSSName.my.fctr Popular.0 Popular.1 .n.tst .strata.0 .strata.1
## 5 #U.S.#Education 325 NA 89 82 17
## 10 Culture## 1 NA 70 1 13
## 12 Foreign#World# 172 NA 47 44 9
## 21 myOther 38 NA 5 5 1
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Loading required package: sampling
##
## Attaching package: 'sampling'
##
## The following objects are masked from 'package:survival':
##
## cluster, strata
##
## The following object is masked from 'package:caret':
##
## cluster
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Popular.0 Popular.1 Popular.NA
## NA NA 1870
## Fit 3941 863 NA
## OOB 1498 230 NA
## Popular.0 Popular.1 Popular.NA
## NA NA 1
## Fit 0.8203580 0.1796420 NA
## OOB 0.8668981 0.1331019 NA
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## NDSSName.my.fctr .n.Fit .n.OOB .n.Tst .freqRatio.Fit
## 1 ## 913 371 342 0.190049958
## 6 Business#BusinessDay#Dealbook 629 323 304 0.130932556
## 11 Culture#Arts# 490 185 174 0.101998335
## 15 OpEd#Opinion# 437 89 164 0.090965862
## 9 Business#Technology# 213 126 114 0.044338052
## 19 TStyle## 623 101 105 0.129683597
## 5 #U.S.#Education 243 82 89 0.050582848
## 10 Culture## NA 1 70 NA
## 14 Metro#N.Y./Region# 128 70 67 0.026644463
## 18 Styles#U.S.# 127 50 61 0.026436303
## 16 Science#Health# 148 48 57 0.030807660
## 13 Foreign#World#AsiaPacific 150 53 56 0.031223980
## 2 #Multimedia# 92 49 52 0.019150708
## 12 Foreign#World# 128 44 47 0.026644463
## 8 Business#Crosswords/Games# 105 18 42 0.021856786
## 7 Business#BusinessDay#SmallBusiness 100 40 41 0.020815987
## 20 Travel#Travel# 83 34 35 0.017277269
## 3 #Opinion#RoomForDebate 42 20 20 0.008742714
## 17 Styles##Fashion 104 15 15 0.021648626
## 4 #Opinion#ThePublicEditor 16 4 10 0.003330558
## 21 myOther 33 5 5 0.006869276
## .freqRatio.OOB .freqRatio.Tst
## 1 0.2146990741 0.182887701
## 6 0.1869212963 0.162566845
## 11 0.1070601852 0.093048128
## 15 0.0515046296 0.087700535
## 9 0.0729166667 0.060962567
## 19 0.0584490741 0.056149733
## 5 0.0474537037 0.047593583
## 10 0.0005787037 0.037433155
## 14 0.0405092593 0.035828877
## 18 0.0289351852 0.032620321
## 16 0.0277777778 0.030481283
## 13 0.0306712963 0.029946524
## 2 0.0283564815 0.027807487
## 12 0.0254629630 0.025133690
## 8 0.0104166667 0.022459893
## 7 0.0231481481 0.021925134
## 20 0.0196759259 0.018716578
## 3 0.0115740741 0.010695187
## 17 0.0086805556 0.008021390
## 4 0.0023148148 0.005347594
## 21 0.0028935185 0.002673797
## [1] "glbObsAll: "
## [1] 8402 53
## [1] "glbObsTrn: "
## [1] 6532 53
## [1] "glbObsFit: "
## [1] 4804 52
## [1] "glbObsOOB: "
## [1] 1728 52
## [1] "glbObsNew: "
## [1] 1870 52
## Warning in rm(split): object 'split' not found
## label step_major step_minor label_minor bgn end
## 8 partition.data.training 4 0 0 36.455 37.871
## 9 select.features 5 0 0 37.872 NA
## elapsed
## 8 1.417
## 9 NA
# 5.0: select features
## Warning in cor(data.matrix(entity_df[, sel_feats]), y =
## as.numeric(entity_df[, : the standard deviation is zero
## id cor.y
## Popular Popular 1.000000000
## WordCount.root2 WordCount.root2 0.292120679
## WordCount WordCount 0.257526549
## WordCount.log1p WordCount.log1p 0.254319628
## NDSSName.my.fctr NDSSName.my.fctr 0.165445970
## PubDate.day.minutes PubDate.day.minutes 0.156753478
## PubDate.day.minutes.poly.1 PubDate.day.minutes.poly.1 0.156753478
## PubDate.hour.fctr PubDate.hour.fctr 0.135436805
## PubDate.wkend PubDate.wkend 0.104707290
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4 0.073941394
## PubDate.day.minutes.poly.2 PubDate.day.minutes.poly.2 0.070977720
## PubDate.last4.log1p PubDate.last4.log1p 0.066473282
## PubDate.last2.log1p PubDate.last2.log1p 0.063068716
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5 -0.055929231
## PubDate.last8.log1p PubDate.last8.log1p 0.054458821
## WordCount.nexp WordCount.nexp -0.053208396
## PubDate.last16.log1p PubDate.last16.log1p 0.040735543
## PubDate.wkday.fctr PubDate.wkday.fctr -0.039801288
## PubDate.minute.fctr PubDate.minute.fctr -0.034073846
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.3 0.027983551
## PubDate.zoo.ctg PubDate.zoo.ctg 0.022782795
## PubDate.month.fctr PubDate.month.fctr 0.019148739
## PubDate.POSIX PubDate.POSIX 0.015683258
## PubDate.last32.log1p.ctg PubDate.last32.log1p.ctg 0.015395971
## PubDate.day.minutes.poly.3.ctg PubDate.day.minutes.poly.3.ctg 0.014982807
## PubDate.hlday PubDate.hlday 0.014690122
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.4.ctg 0.014601521
## PubDate.day.minutes.poly.5.ctg PubDate.day.minutes.poly.5.ctg 0.014574775
## PubDate.juliandate PubDate.juliandate 0.014361075
## PubDate.zoo PubDate.zoo 0.013260902
## PubDate.second.fctr PubDate.second.fctr -0.011879458
## UniqueID UniqueID 0.011824920
## PubDate.date.fctr PubDate.date.fctr -0.011647558
## .rnorm .rnorm 0.008212201
## PubDate.last16.log1p.ctg PubDate.last16.log1p.ctg 0.007783530
## PubDate.last2.log1p.ctg PubDate.last2.log1p.ctg 0.006916600
## PubDate.last4.log1p.ctg PubDate.last4.log1p.ctg 0.004792781
## PubDate.last8.log1p.ctg PubDate.last8.log1p.ctg 0.003914960
## PubDate.day.minutes.poly.2.ctg PubDate.day.minutes.poly.2.ctg 0.003596414
## PubDate.last32.log1p PubDate.last32.log1p 0.003558081
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.1.ctg -0.002432289
## PubDate.year.fctr PubDate.year.fctr NA
## exclude.as.feat cor.y.abs
## Popular 1 1.000000000
## WordCount.root2 0 0.292120679
## WordCount 1 0.257526549
## WordCount.log1p 0 0.254319628
## NDSSName.my.fctr 0 0.165445970
## PubDate.day.minutes 1 0.156753478
## PubDate.day.minutes.poly.1 0 0.156753478
## PubDate.hour.fctr 0 0.135436805
## PubDate.wkend 0 0.104707290
## PubDate.day.minutes.poly.4 0 0.073941394
## PubDate.day.minutes.poly.2 0 0.070977720
## PubDate.last4.log1p 0 0.066473282
## PubDate.last2.log1p 0 0.063068716
## PubDate.day.minutes.poly.5 0 0.055929231
## PubDate.last8.log1p 0 0.054458821
## WordCount.nexp 0 0.053208396
## PubDate.last16.log1p 0 0.040735543
## PubDate.wkday.fctr 0 0.039801288
## PubDate.minute.fctr 0 0.034073846
## PubDate.day.minutes.poly.3 0 0.027983551
## PubDate.zoo.ctg 1 0.022782795
## PubDate.month.fctr 0 0.019148739
## PubDate.POSIX 1 0.015683258
## PubDate.last32.log1p.ctg 0 0.015395971
## PubDate.day.minutes.poly.3.ctg 0 0.014982807
## PubDate.hlday 0 0.014690122
## PubDate.day.minutes.poly.4.ctg 0 0.014601521
## PubDate.day.minutes.poly.5.ctg 0 0.014574775
## PubDate.juliandate 0 0.014361075
## PubDate.zoo 1 0.013260902
## PubDate.second.fctr 0 0.011879458
## UniqueID 1 0.011824920
## PubDate.date.fctr 0 0.011647558
## .rnorm 0 0.008212201
## PubDate.last16.log1p.ctg 0 0.007783530
## PubDate.last2.log1p.ctg 0 0.006916600
## PubDate.last4.log1p.ctg 0 0.004792781
## PubDate.last8.log1p.ctg 0 0.003914960
## PubDate.day.minutes.poly.2.ctg 0 0.003596414
## PubDate.last32.log1p 0 0.003558081
## PubDate.day.minutes.poly.1.ctg 0 0.002432289
## PubDate.year.fctr 0 NA
## [1] "cor(PubDate.juliandate, PubDate.month.fctr)=0.9393"
## [1] "cor(Popular.fctr, PubDate.juliandate)=0.0144"
## [1] "cor(Popular.fctr, PubDate.month.fctr)=0.0191"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified PubDate.juliandate as highly correlated with
## PubDate.month.fctr
## [1] "cor(PubDate.day.minutes.poly.1, PubDate.hour.fctr)=0.9026"
## [1] "cor(Popular.fctr, PubDate.day.minutes.poly.1)=0.1568"
## [1] "cor(Popular.fctr, PubDate.hour.fctr)=0.1354"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified PubDate.hour.fctr as highly correlated with
## PubDate.day.minutes.poly.1
## [1] "cor(WordCount.log1p, WordCount.root2)=0.8906"
## [1] "cor(Popular.fctr, WordCount.log1p)=0.2543"
## [1] "cor(Popular.fctr, WordCount.root2)=0.2921"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified WordCount.log1p as highly correlated with
## WordCount.root2
## [1] "cor(PubDate.last4.log1p, PubDate.last8.log1p)=0.8253"
## [1] "cor(Popular.fctr, PubDate.last4.log1p)=0.0665"
## [1] "cor(Popular.fctr, PubDate.last8.log1p)=0.0545"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified PubDate.last8.log1p as highly correlated with
## PubDate.last4.log1p
## [1] "cor(PubDate.last2.log1p, PubDate.last4.log1p)=0.7598"
## [1] "cor(Popular.fctr, PubDate.last2.log1p)=0.0631"
## [1] "cor(Popular.fctr, PubDate.last4.log1p)=0.0665"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified PubDate.last2.log1p as highly correlated with
## PubDate.last4.log1p
## id cor.y
## Popular Popular 1.000000000
## WordCount.root2 WordCount.root2 0.292120679
## WordCount WordCount 0.257526549
## WordCount.log1p WordCount.log1p 0.254319628
## NDSSName.my.fctr NDSSName.my.fctr 0.165445970
## PubDate.day.minutes PubDate.day.minutes 0.156753478
## PubDate.day.minutes.poly.1 PubDate.day.minutes.poly.1 0.156753478
## PubDate.hour.fctr PubDate.hour.fctr 0.135436805
## PubDate.wkend PubDate.wkend 0.104707290
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4 0.073941394
## PubDate.day.minutes.poly.2 PubDate.day.minutes.poly.2 0.070977720
## PubDate.last4.log1p PubDate.last4.log1p 0.066473282
## PubDate.last2.log1p PubDate.last2.log1p 0.063068716
## PubDate.last8.log1p PubDate.last8.log1p 0.054458821
## PubDate.last16.log1p PubDate.last16.log1p 0.040735543
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.3 0.027983551
## PubDate.zoo.ctg PubDate.zoo.ctg 0.022782795
## PubDate.month.fctr PubDate.month.fctr 0.019148739
## PubDate.POSIX PubDate.POSIX 0.015683258
## PubDate.last32.log1p.ctg PubDate.last32.log1p.ctg 0.015395971
## PubDate.day.minutes.poly.3.ctg PubDate.day.minutes.poly.3.ctg 0.014982807
## PubDate.hlday PubDate.hlday 0.014690122
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.4.ctg 0.014601521
## PubDate.day.minutes.poly.5.ctg PubDate.day.minutes.poly.5.ctg 0.014574775
## PubDate.juliandate PubDate.juliandate 0.014361075
## PubDate.zoo PubDate.zoo 0.013260902
## UniqueID UniqueID 0.011824920
## .rnorm .rnorm 0.008212201
## PubDate.last16.log1p.ctg PubDate.last16.log1p.ctg 0.007783530
## PubDate.last2.log1p.ctg PubDate.last2.log1p.ctg 0.006916600
## PubDate.last4.log1p.ctg PubDate.last4.log1p.ctg 0.004792781
## PubDate.last8.log1p.ctg PubDate.last8.log1p.ctg 0.003914960
## PubDate.day.minutes.poly.2.ctg PubDate.day.minutes.poly.2.ctg 0.003596414
## PubDate.last32.log1p PubDate.last32.log1p 0.003558081
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.1.ctg -0.002432289
## PubDate.date.fctr PubDate.date.fctr -0.011647558
## PubDate.second.fctr PubDate.second.fctr -0.011879458
## PubDate.minute.fctr PubDate.minute.fctr -0.034073846
## PubDate.wkday.fctr PubDate.wkday.fctr -0.039801288
## WordCount.nexp WordCount.nexp -0.053208396
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5 -0.055929231
## PubDate.year.fctr PubDate.year.fctr NA
## exclude.as.feat cor.y.abs
## Popular 1 1.000000000
## WordCount.root2 0 0.292120679
## WordCount 1 0.257526549
## WordCount.log1p 0 0.254319628
## NDSSName.my.fctr 0 0.165445970
## PubDate.day.minutes 1 0.156753478
## PubDate.day.minutes.poly.1 0 0.156753478
## PubDate.hour.fctr 0 0.135436805
## PubDate.wkend 0 0.104707290
## PubDate.day.minutes.poly.4 0 0.073941394
## PubDate.day.minutes.poly.2 0 0.070977720
## PubDate.last4.log1p 0 0.066473282
## PubDate.last2.log1p 0 0.063068716
## PubDate.last8.log1p 0 0.054458821
## PubDate.last16.log1p 0 0.040735543
## PubDate.day.minutes.poly.3 0 0.027983551
## PubDate.zoo.ctg 1 0.022782795
## PubDate.month.fctr 0 0.019148739
## PubDate.POSIX 1 0.015683258
## PubDate.last32.log1p.ctg 0 0.015395971
## PubDate.day.minutes.poly.3.ctg 0 0.014982807
## PubDate.hlday 0 0.014690122
## PubDate.day.minutes.poly.4.ctg 0 0.014601521
## PubDate.day.minutes.poly.5.ctg 0 0.014574775
## PubDate.juliandate 0 0.014361075
## PubDate.zoo 1 0.013260902
## UniqueID 1 0.011824920
## .rnorm 0 0.008212201
## PubDate.last16.log1p.ctg 0 0.007783530
## PubDate.last2.log1p.ctg 0 0.006916600
## PubDate.last4.log1p.ctg 0 0.004792781
## PubDate.last8.log1p.ctg 0 0.003914960
## PubDate.day.minutes.poly.2.ctg 0 0.003596414
## PubDate.last32.log1p 0 0.003558081
## PubDate.day.minutes.poly.1.ctg 0 0.002432289
## PubDate.date.fctr 0 0.011647558
## PubDate.second.fctr 0 0.011879458
## PubDate.minute.fctr 0 0.034073846
## PubDate.wkday.fctr 0 0.039801288
## WordCount.nexp 0 0.053208396
## PubDate.day.minutes.poly.5 0 0.055929231
## PubDate.year.fctr 0 NA
## cor.high.X freqRatio
## Popular <NA> 4.976212
## WordCount.root2 <NA> 2.315789
## WordCount <NA> 2.315789
## WordCount.log1p WordCount.root2 2.315789
## NDSSName.my.fctr <NA> 1.348739
## PubDate.day.minutes <NA> 1.225490
## PubDate.day.minutes.poly.1 <NA> 1.225490
## PubDate.hour.fctr PubDate.day.minutes.poly.1 1.835040
## PubDate.wkend <NA> 12.011952
## PubDate.day.minutes.poly.4 <NA> 1.225490
## PubDate.day.minutes.poly.2 <NA> 1.225490
## PubDate.last4.log1p <NA> 1.125000
## PubDate.last2.log1p PubDate.last4.log1p 1.375000
## PubDate.last8.log1p PubDate.last4.log1p 1.142857
## PubDate.last16.log1p <NA> 3.200000
## PubDate.day.minutes.poly.3 <NA> 1.225490
## PubDate.zoo.ctg <NA> 1.000000
## PubDate.month.fctr <NA> 1.017514
## PubDate.POSIX <NA> 1.000000
## PubDate.last32.log1p.ctg <NA> 239.000000
## PubDate.day.minutes.poly.3.ctg <NA> 1.083333
## PubDate.hlday <NA> 28.160714
## PubDate.day.minutes.poly.4.ctg <NA> 1.083333
## PubDate.day.minutes.poly.5.ctg <NA> 1.083333
## PubDate.juliandate PubDate.month.fctr 1.032520
## PubDate.zoo <NA> 1.000000
## UniqueID <NA> 1.000000
## .rnorm <NA> 1.000000
## PubDate.last16.log1p.ctg <NA> 60.000000
## PubDate.last2.log1p.ctg <NA> 5.000000
## PubDate.last4.log1p.ctg <NA> 20.000000
## PubDate.last8.log1p.ctg <NA> 40.000000
## PubDate.day.minutes.poly.2.ctg <NA> 1.083333
## PubDate.last32.log1p <NA> 8.000000
## PubDate.day.minutes.poly.1.ctg <NA> 1.083333
## PubDate.date.fctr <NA> 1.021394
## PubDate.second.fctr <NA> 1.018204
## PubDate.minute.fctr <NA> 1.483365
## PubDate.wkday.fctr <NA> 1.003268
## WordCount.nexp <NA> 17.761364
## PubDate.day.minutes.poly.5 <NA> 1.225490
## PubDate.year.fctr <NA> 0.000000
## percentUnique zeroVar nzv
## Popular 0.03061849 FALSE FALSE
## WordCount.root2 24.15799143 FALSE FALSE
## WordCount 24.15799143 FALSE FALSE
## WordCount.log1p 24.15799143 FALSE FALSE
## NDSSName.my.fctr 0.32149418 FALSE FALSE
## PubDate.day.minutes 18.08022045 FALSE FALSE
## PubDate.day.minutes.poly.1 18.08022045 FALSE FALSE
## PubDate.hour.fctr 0.04592774 FALSE FALSE
## PubDate.wkend 0.03061849 FALSE FALSE
## PubDate.day.minutes.poly.4 18.08022045 FALSE FALSE
## PubDate.day.minutes.poly.2 18.08022045 FALSE FALSE
## PubDate.last4.log1p 64.98775260 FALSE FALSE
## PubDate.last2.log1p 51.17881200 FALSE FALSE
## PubDate.last8.log1p 75.12247397 FALSE FALSE
## PubDate.last16.log1p 84.44580527 FALSE FALSE
## PubDate.day.minutes.poly.3 18.08022045 FALSE FALSE
## PubDate.zoo.ctg 99.92345377 FALSE FALSE
## PubDate.month.fctr 0.04592774 FALSE FALSE
## PubDate.POSIX 99.86221678 FALSE FALSE
## PubDate.last32.log1p.ctg 92.11573791 FALSE FALSE
## PubDate.day.minutes.poly.3.ctg 53.96509492 FALSE FALSE
## PubDate.hlday 0.03061849 FALSE TRUE
## PubDate.day.minutes.poly.4.ctg 53.94978567 FALSE FALSE
## PubDate.day.minutes.poly.5.ctg 53.94978567 FALSE FALSE
## PubDate.juliandate 1.39314146 FALSE FALSE
## PubDate.zoo 99.86221678 FALSE FALSE
## UniqueID 100.00000000 FALSE FALSE
## .rnorm 100.00000000 FALSE FALSE
## PubDate.last16.log1p.ctg 95.17758726 FALSE FALSE
## PubDate.last2.log1p.ctg 92.19228414 FALSE FALSE
## PubDate.last4.log1p.ctg 95.88181261 FALSE FALSE
## PubDate.last8.log1p.ctg 96.41763625 FALSE FALSE
## PubDate.day.minutes.poly.2.ctg 53.94978567 FALSE FALSE
## PubDate.last32.log1p 90.99816289 FALSE FALSE
## PubDate.day.minutes.poly.1.ctg 53.96509492 FALSE FALSE
## PubDate.date.fctr 0.07654623 FALSE FALSE
## PubDate.second.fctr 0.06123699 FALSE FALSE
## PubDate.minute.fctr 0.06123699 FALSE FALSE
## PubDate.wkday.fctr 0.10716473 FALSE FALSE
## WordCount.nexp 11.32884262 FALSE FALSE
## PubDate.day.minutes.poly.5 18.08022045 FALSE FALSE
## PubDate.year.fctr 0.01530925 TRUE TRUE
## is.cor.y.abs.low
## Popular FALSE
## WordCount.root2 FALSE
## WordCount FALSE
## WordCount.log1p FALSE
## NDSSName.my.fctr FALSE
## PubDate.day.minutes FALSE
## PubDate.day.minutes.poly.1 FALSE
## PubDate.hour.fctr FALSE
## PubDate.wkend FALSE
## PubDate.day.minutes.poly.4 FALSE
## PubDate.day.minutes.poly.2 FALSE
## PubDate.last4.log1p FALSE
## PubDate.last2.log1p FALSE
## PubDate.last8.log1p FALSE
## PubDate.last16.log1p FALSE
## PubDate.day.minutes.poly.3 FALSE
## PubDate.zoo.ctg FALSE
## PubDate.month.fctr FALSE
## PubDate.POSIX FALSE
## PubDate.last32.log1p.ctg FALSE
## PubDate.day.minutes.poly.3.ctg FALSE
## PubDate.hlday FALSE
## PubDate.day.minutes.poly.4.ctg FALSE
## PubDate.day.minutes.poly.5.ctg FALSE
## PubDate.juliandate FALSE
## PubDate.zoo FALSE
## UniqueID FALSE
## .rnorm FALSE
## PubDate.last16.log1p.ctg TRUE
## PubDate.last2.log1p.ctg TRUE
## PubDate.last4.log1p.ctg TRUE
## PubDate.last8.log1p.ctg TRUE
## PubDate.day.minutes.poly.2.ctg TRUE
## PubDate.last32.log1p TRUE
## PubDate.day.minutes.poly.1.ctg TRUE
## PubDate.date.fctr FALSE
## PubDate.second.fctr FALSE
## PubDate.minute.fctr FALSE
## PubDate.wkday.fctr FALSE
## WordCount.nexp FALSE
## PubDate.day.minutes.poly.5 FALSE
## PubDate.year.fctr NA
## Warning in myplot_scatter(plt_feats_df, "percentUnique", "freqRatio",
## colorcol_name = "nzv", : converting nzv to class:factor
## Warning: Removed 20 rows containing missing values (geom_point).
## Warning: Removed 20 rows containing missing values (geom_point).
## Warning: Removed 20 rows containing missing values (geom_point).
## id cor.y exclude.as.feat cor.y.abs
## PubDate.hlday PubDate.hlday 0.01469012 0 0.01469012
## PubDate.year.fctr PubDate.year.fctr NA 0 NA
## cor.high.X freqRatio percentUnique zeroVar nzv
## PubDate.hlday <NA> 28.16071 0.03061849 FALSE TRUE
## PubDate.year.fctr <NA> 0.00000 0.01530925 TRUE TRUE
## is.cor.y.abs.low
## PubDate.hlday FALSE
## PubDate.year.fctr NA
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## +(rfe) fit Fold1.Rep1 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold1.Rep1 size: 60
## +(rfe) imp Fold1.Rep1
## -(rfe) imp Fold1.Rep1
## +(rfe) fit Fold1.Rep1 size: 32
## -(rfe) fit Fold1.Rep1 size: 32
## +(rfe) fit Fold1.Rep1 size: 16
## -(rfe) fit Fold1.Rep1 size: 16
## +(rfe) fit Fold1.Rep1 size: 8
## -(rfe) fit Fold1.Rep1 size: 8
## +(rfe) fit Fold1.Rep1 size: 4
## -(rfe) fit Fold1.Rep1 size: 4
## +(rfe) fit Fold1.Rep1 size: 2
## -(rfe) fit Fold1.Rep1 size: 2
## +(rfe) fit Fold2.Rep1 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold2.Rep1 size: 60
## +(rfe) imp Fold2.Rep1
## -(rfe) imp Fold2.Rep1
## +(rfe) fit Fold2.Rep1 size: 32
## -(rfe) fit Fold2.Rep1 size: 32
## +(rfe) fit Fold2.Rep1 size: 16
## -(rfe) fit Fold2.Rep1 size: 16
## +(rfe) fit Fold2.Rep1 size: 8
## -(rfe) fit Fold2.Rep1 size: 8
## +(rfe) fit Fold2.Rep1 size: 4
## -(rfe) fit Fold2.Rep1 size: 4
## +(rfe) fit Fold2.Rep1 size: 2
## -(rfe) fit Fold2.Rep1 size: 2
## +(rfe) fit Fold3.Rep1 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold3.Rep1 size: 60
## +(rfe) imp Fold3.Rep1
## -(rfe) imp Fold3.Rep1
## +(rfe) fit Fold3.Rep1 size: 32
## -(rfe) fit Fold3.Rep1 size: 32
## +(rfe) fit Fold3.Rep1 size: 16
## -(rfe) fit Fold3.Rep1 size: 16
## +(rfe) fit Fold3.Rep1 size: 8
## -(rfe) fit Fold3.Rep1 size: 8
## +(rfe) fit Fold3.Rep1 size: 4
## -(rfe) fit Fold3.Rep1 size: 4
## +(rfe) fit Fold3.Rep1 size: 2
## -(rfe) fit Fold3.Rep1 size: 2
## +(rfe) fit Fold1.Rep2 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold1.Rep2 size: 60
## +(rfe) imp Fold1.Rep2
## -(rfe) imp Fold1.Rep2
## +(rfe) fit Fold1.Rep2 size: 32
## -(rfe) fit Fold1.Rep2 size: 32
## +(rfe) fit Fold1.Rep2 size: 16
## -(rfe) fit Fold1.Rep2 size: 16
## +(rfe) fit Fold1.Rep2 size: 8
## -(rfe) fit Fold1.Rep2 size: 8
## +(rfe) fit Fold1.Rep2 size: 4
## -(rfe) fit Fold1.Rep2 size: 4
## +(rfe) fit Fold1.Rep2 size: 2
## -(rfe) fit Fold1.Rep2 size: 2
## +(rfe) fit Fold2.Rep2 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold2.Rep2 size: 60
## +(rfe) imp Fold2.Rep2
## -(rfe) imp Fold2.Rep2
## +(rfe) fit Fold2.Rep2 size: 32
## -(rfe) fit Fold2.Rep2 size: 32
## +(rfe) fit Fold2.Rep2 size: 16
## -(rfe) fit Fold2.Rep2 size: 16
## +(rfe) fit Fold2.Rep2 size: 8
## -(rfe) fit Fold2.Rep2 size: 8
## +(rfe) fit Fold2.Rep2 size: 4
## -(rfe) fit Fold2.Rep2 size: 4
## +(rfe) fit Fold2.Rep2 size: 2
## -(rfe) fit Fold2.Rep2 size: 2
## +(rfe) fit Fold3.Rep2 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold3.Rep2 size: 60
## +(rfe) imp Fold3.Rep2
## -(rfe) imp Fold3.Rep2
## +(rfe) fit Fold3.Rep2 size: 32
## -(rfe) fit Fold3.Rep2 size: 32
## +(rfe) fit Fold3.Rep2 size: 16
## -(rfe) fit Fold3.Rep2 size: 16
## +(rfe) fit Fold3.Rep2 size: 8
## -(rfe) fit Fold3.Rep2 size: 8
## +(rfe) fit Fold3.Rep2 size: 4
## -(rfe) fit Fold3.Rep2 size: 4
## +(rfe) fit Fold3.Rep2 size: 2
## -(rfe) fit Fold3.Rep2 size: 2
## +(rfe) fit Fold1.Rep3 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold1.Rep3 size: 60
## +(rfe) imp Fold1.Rep3
## -(rfe) imp Fold1.Rep3
## +(rfe) fit Fold1.Rep3 size: 32
## -(rfe) fit Fold1.Rep3 size: 32
## +(rfe) fit Fold1.Rep3 size: 16
## -(rfe) fit Fold1.Rep3 size: 16
## +(rfe) fit Fold1.Rep3 size: 8
## -(rfe) fit Fold1.Rep3 size: 8
## +(rfe) fit Fold1.Rep3 size: 4
## -(rfe) fit Fold1.Rep3 size: 4
## +(rfe) fit Fold1.Rep3 size: 2
## -(rfe) fit Fold1.Rep3 size: 2
## +(rfe) fit Fold2.Rep3 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold2.Rep3 size: 60
## +(rfe) imp Fold2.Rep3
## -(rfe) imp Fold2.Rep3
## +(rfe) fit Fold2.Rep3 size: 32
## -(rfe) fit Fold2.Rep3 size: 32
## +(rfe) fit Fold2.Rep3 size: 16
## -(rfe) fit Fold2.Rep3 size: 16
## +(rfe) fit Fold2.Rep3 size: 8
## -(rfe) fit Fold2.Rep3 size: 8
## +(rfe) fit Fold2.Rep3 size: 4
## -(rfe) fit Fold2.Rep3 size: 4
## +(rfe) fit Fold2.Rep3 size: 2
## -(rfe) fit Fold2.Rep3 size: 2
## +(rfe) fit Fold3.Rep3 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold3.Rep3 size: 60
## +(rfe) imp Fold3.Rep3
## -(rfe) imp Fold3.Rep3
## +(rfe) fit Fold3.Rep3 size: 32
## -(rfe) fit Fold3.Rep3 size: 32
## +(rfe) fit Fold3.Rep3 size: 16
## -(rfe) fit Fold3.Rep3 size: 16
## +(rfe) fit Fold3.Rep3 size: 8
## -(rfe) fit Fold3.Rep3 size: 8
## +(rfe) fit Fold3.Rep3 size: 4
## -(rfe) fit Fold3.Rep3 size: 4
## +(rfe) fit Fold3.Rep3 size: 2
## -(rfe) fit Fold3.Rep3 size: 2
## Warning in lda.default(x, grouping, ...): variables are collinear
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (3 fold, repeated 3 times)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 2 0.8096 0.03708 0.005112 0.01788
## 4 0.8864 0.52518 0.005536 0.02625
## 8 0.8945 0.57850 0.013874 0.06859
## 16 0.9304 0.75912 0.004608 0.01694
## 32 0.9305 0.75960 0.004552 0.01678
## 60 0.9326 0.76882 0.004814 0.01705 *
##
## The top 5 variables (out of 60):
## WordCount.log1p, WordCount.root2, WordCount.nexp, NDSSName.my.fctrOpEd#Opinion#, PubDate.day.minutes.poly.1
##
## [1] "WordCount.log1p"
## [2] "WordCount.root2"
## [3] "WordCount.nexp"
## [4] "NDSSName.my.fctrOpEd#Opinion#"
## [5] "PubDate.day.minutes.poly.1"
## [6] "PubDate.day.minutes.poly.4"
## [7] "PubDate.hour.fctr(15.3,23]"
## [8] "NDSSName.my.fctrScience#Health#"
## [9] "PubDate.last4.log1p"
## [10] "PubDate.last2.log1p"
## [11] "NDSSName.my.fctrBusiness#Crosswords/Games#"
## [12] "NDSSName.my.fctrStyles#U.S.#"
## [13] "PubDate.last8.log1p"
## [14] "PubDate.day.minutes.poly.5"
## [15] "PubDate.wkend"
## [16] "NDSSName.my.fctr##:PubDate.day.minutes.poly.2.ctg"
## [17] "PubDate.last16.log1p"
## [18] "PubDate.juliandate"
## [19] "PubDate.month.fctr11"
## [20] "PubDate.day.minutes.poly.3"
## [21] "PubDate.wkday.fctr6"
## [22] "PubDate.date.fctr(7,13]"
## [23] "PubDate.second.fctr(14.8,29.5]"
## [24] "PubDate.wkday.fctr1"
## [25] "PubDate.month.fctr10"
## [26] ".rnorm"
## [27] "PubDate.last32.log1p"
## [28] "NDSSName.my.fctr##:PubDate.day.minutes.poly.4.ctg"
## [29] "PubDate.minute.fctr(44.2,59.1]"
## [30] "NDSSName.my.fctr##:PubDate.day.minutes.poly.5.ctg"
## [31] "NDSSName.my.fctr##:PubDate.day.minutes.poly.1.ctg"
## [32] "PubDate.day.minutes.poly.2"
## [33] "PubDate.hour.fctr(7.67,15.3]"
## [34] "PubDate.date.fctr(25,31]"
## [35] "PubDate.minute.fctr(14.8,29.5]"
## [36] "PubDate.second.fctr(44.2,59.1]"
## [37] "PubDate.wkday.fctr3"
## [38] "NDSSName.my.fctr##:PubDate.day.minutes.poly.3.ctg"
## [39] "NDSSName.my.fctrmyOther"
## [40] "NDSSName.my.fctr#Opinion#RoomForDebate"
## [41] "PubDate.date.fctr(19,25]"
## [42] "NDSSName.my.fctrBusiness#Technology#"
## [43] "PubDate.wkday.fctr4"
## [44] "PubDate.second.fctr(29.5,44.2]"
## [45] "PubDate.date.fctr(13,19]"
## [46] "NDSSName.my.fctrMetro#N.Y./Region#"
## [47] "NDSSName.my.fctrTravel#Travel#"
## [48] "NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness"
## [49] "NDSSName.my.fctr#Multimedia#"
## [50] "PubDate.wkday.fctr2"
## [51] "NDSSName.my.fctrStyles##Fashion"
## [52] "NDSSName.my.fctrForeign#World#"
## [53] "PubDate.minute.fctr(29.5,44.2]"
## [54] "NDSSName.my.fctrForeign#World#AsiaPacific"
## [55] "PubDate.wkday.fctr5"
## [56] "NDSSName.my.fctr#U.S.#Education"
## [57] "NDSSName.my.fctrCulture#Arts#"
## [58] "NDSSName.my.fctrBusiness#BusinessDay#Dealbook"
## [59] "NDSSName.my.fctr##"
## [60] "NDSSName.my.fctrTStyle##"
## [1] "numeric data missing in : "
## Popular Popular.fctr
## 1870 1870
## [1] "numeric data w/ 0s in : "
## WordCount Popular WordCount.log1p
## 109 5439 109
## WordCount.root2 WordCount.nexp PubDate.wkday.fctr
## 109 2044 378
## PubDate.wkend PubDate.hlday PubDate.day.minutes
## 7787 8160 5
## PubDate.last2.log1p PubDate.last4.log1p PubDate.last8.log1p
## 2 4 8
## PubDate.last16.log1p PubDate.last32.log1p PubDate.last2.log1p.ctg
## 16 32 42
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg PubDate.last16.log1p.ctg
## 84 168 336
## PubDate.last32.log1p.ctg
## 670
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate NDSSName.my .lcn
## 17 0 0 1870
## [1] "glb_feats_df:"
## [1] 42 12
## id exclude.as.feat rsp_var
## Popular.fctr Popular.fctr TRUE TRUE
## id cor.y exclude.as.feat cor.y.abs cor.high.X
## Popular Popular 1.00000000 TRUE 1.00000000 <NA>
## UniqueID UniqueID 0.01182492 TRUE 0.01182492 <NA>
## Popular.fctr Popular.fctr NA TRUE NA <NA>
## freqRatio percentUnique zeroVar nzv is.cor.y.abs.low
## Popular 4.976212 0.03061849 FALSE FALSE FALSE
## UniqueID 1.000000 100.00000000 FALSE FALSE FALSE
## Popular.fctr NA NA NA NA NA
## interaction.feat shapiro.test.p.value rsp_var_raw id_var
## Popular <NA> NA TRUE NA
## UniqueID <NA> NA FALSE TRUE
## Popular.fctr <NA> NA NA NA
## rsp_var
## Popular NA
## UniqueID NA
## Popular.fctr TRUE
## [1] "glb_feats_df vs. glbObsAll: "
## character(0)
## [1] "glbObsAll vs. glb_feats_df: "
## character(0)
## label step_major step_minor label_minor bgn end elapsed
## 9 select.features 5 0 0 37.872 63.608 25.736
## 10 fit.models 6 0 0 63.608 NA NA
6.0: fit models
fit.models_0_chunk_df <- myadd_chunk(NULL, "fit.models_0_bgn", label.minor = "setup")
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_0_bgn 1 0 setup 64.812 NA NA
# load(paste0(glb_out_pfx, "dsk.RData"))
# Build the one-sided formula used to rank the rows of glb_models_df.
#
# Only the metrics listed in glbMdlMetricsEval that are actually present as
# columns of glb_models_df are used (e.g. min.aic.fit might not be available).
# A metric whose name contains "max" gets a "-" sign, every other metric a "+"
# sign. For binomial classification "opt.prob.threshold.OOB" is appended with
# a "-" sign.
#
# Reads the globals glbMdlMetricsEval, glb_models_df, glb_is_classification
# and glb_is_binomial; takes no arguments.
#
# Returns: a formula such as ~ -max.Accuracy.OOB + min.aic.fit - ...
get_model_sel_frmla <- function() {
    avl_metrics <- glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)]

    # One "<sign> <metric>" term per available metric, in the original order
    metric_signs <- ifelse(grepl("max", avl_metrics), "-", "+")
    frmla_terms <- paste(metric_signs, avl_metrics)

    if (glb_is_classification && glb_is_binomial) {
        frmla_terms <- c(frmla_terms, "- opt.prob.threshold.OOB")
    }

    as.formula(paste(c("~ ", frmla_terms), collapse = " "))
}
# Return the model comparison table for display and, as a side effect, write
# "<glb_out_pfx>bestTune.png" with one tuning plot per plottable model.
#
# The table holds the "id" column, the metrics of glbMdlMetricsEval that exist
# in glb_models_df and every column whose name contains "opt."; rows are
# ordered by get_model_sel_frmla().
#
# A model is flagged as a cross-validation problem when the number of rows in
# its $results does not exceed its number of tuning parameters; those models
# and models with zero tuning parameters are left out of the plot.
#
# Reads the globals glbMdlMetricsEval, glb_models_df, glb_models_lst and
# glb_out_pfx; takes no arguments.
#
# Returns: the ordered subset of glb_models_df described above.
get_dsp_models_df <- function() {
    avl_metrics <- glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)]
    dsp_models_cols <- c("id", avl_metrics,
        grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE))
    dsp_models_df <-
        orderBy(get_model_sel_frmla(), glb_models_df)[, dsp_models_cols]

    # vapply keeps both vectors atomic & named even for an empty model list
    nCvMdl <- vapply(glb_models_lst, function(mdl) nrow(mdl$results), integer(1))
    nParams <- vapply(glb_models_lst, function(mdl) {
        # custom models are treated as having no tuning parameters
        if (isTRUE(mdl$method == "custom")) {
            return(0)
        }
        nrow(subset(modelLookup(mdl$method), parameter != "parameter"))
    }, numeric(1))
    # nCvMdl <- nCvMdl[names(nCvMdl) != "avNNet"]
    # nParams <- nParams[names(nParams) != "avNNet"]

    cvMdlProblems <- nCvMdl[nCvMdl <= nParams]
    if (length(cvMdlProblems) > 0) {
        print("Cross Validation issues:")
        warning("Cross Validation issues:")
        print(cvMdlProblems)
    }

    pltMdls <- setdiff(names(nCvMdl), names(cvMdlProblems))
    pltMdls <- setdiff(pltMdls, names(nParams[nParams == 0]))
    # length(pltMdls) == 21

    png(paste0(glb_out_pfx, "bestTune.png"), width = 480 * 2, height = 480 * 4)
    # Close the png device even if one of the plots below fails; otherwise the
    # device stays open and captures all subsequent graphics output.
    png_dev <- dev.cur()
    on.exit(if (png_dev %in% dev.list()) dev.off(which = png_dev), add = TRUE)

    grid.newpage()
    pushViewport(viewport(layout = grid.layout(ceiling(length(pltMdls) / 2.0), 2)))
    # Fill the two-column grid row by row
    for (pltIx in seq_along(pltMdls)) {
        mdlId <- pltMdls[pltIx]
        print(ggplot(glb_models_lst[[mdlId]], highBestTune = TRUE) + labs(title = mdlId),
              vp = viewport(layout.pos.row = ceiling(pltIx / 2.0),
                            layout.pos.col = ((pltIx - 1) %% 2) + 1))
    }

    dsp_models_df
}
#get_dsp_models_df()
# Binomial classification needs at least two distinct response values in the
# fit observations; abort with the values found when that is not the case.
if (glb_is_classification && glb_is_binomial) {
    if (length(unique(glbObsFit[, glb_rsp_var])) < 2) {
        stop("glbObsFit$", glb_rsp_var, ": contains less than 2 unique values: ",
             paste0(unique(glbObsFit[, glb_rsp_var]), collapse=", "))
    }
}
# Ids of (at most) the two features with the largest cor.y.abs among those
# that are not excluded, not near-zero-variance, not flagged as low
# correlation and have no cor.high.X entry.
max_cor_y_x_vars <- orderBy(
    ~ -cor.y.abs,
    subset(glb_feats_df,
           (exclude.as.feat == 0) &
               !nzv &
               !is.cor.y.abs.low &
               is.na(cor.high.X))
)[seq_len(2), "id"]
# Fewer than two qualifying features leave NA placeholders; drop them
max_cor_y_x_vars <- max_cor_y_x_vars[!is.na(max_cor_y_x_vars)]
# Sanity check for a configured Baseline feature: stop when the feature with
# the highest cor.y.abs is a different variable AND is more strongly
# correlated with the response than the Baseline feature.
# The check is skipped when no candidate feature qualified above, and the
# scalar && avoids evaluating the correlation lookup when the top feature
# already is the Baseline feature.
if (!is.null(glb_Baseline_mdl_var) && (length(max_cor_y_x_vars) > 0)) {
    if ((max_cor_y_x_vars[1] != glb_Baseline_mdl_var) &&
        (glb_feats_df[glb_feats_df$id == max_cor_y_x_vars[1], "cor.y.abs"] >
         glb_feats_df[glb_feats_df$id == glb_Baseline_mdl_var, "cor.y.abs"]))
        stop(max_cor_y_x_vars[1], " has a higher correlation with ", glb_rsp_var,
             " than the Baseline var: ", glb_Baseline_mdl_var)
}
# Problem type label passed as `type` to every model spec below.
# Plain if/else (instead of ifelse) fails fast when glb_is_regression is NA or
# empty rather than silently producing an NA / zero-length model type.
glb_model_type <- if (glb_is_regression) "regression" else "classification"
# Model specs
# NOTE(review): the fits below pass trainControl.* / train.* names to
#   myinit_mdl_specs_lst, which differ from this list - confirm it is current.
c("id.prefix", "method", "type",
  # trainControl params
  "preProc.method", "cv.n.folds", "cv.n.repeats", "summary.fn",
  # train params
  "metric", "metric.maximize", "tune.df")
## [1] "id.prefix" "method" "type"
## [4] "preProc.method" "cv.n.folds" "cv.n.repeats"
## [7] "summary.fn" "metric" "metric.maximize"
## [10] "tune.df"
# Baseline
# Fit the baseline classifier on the single configured Baseline feature; only
# runs when glb_Baseline_mdl_var is set.
# The myfit_mdl call follows the mdl_specs_lst / indep_vars convention used by
# the MFO, Random and Max.cor.Y fits in this file instead of the older
# mdl_id / model_method / indep_vars_vctr arguments.
# NOTE(review): myfit_mdl is defined outside this file - confirm the signature.
if (!is.null(glb_Baseline_mdl_var)) {
    fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
        paste0("fit.models_0_", "Baseline"), major.inc = FALSE,
        label.minor = "mybaseln_classfr")
    ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
        id.prefix = "Baseline", type = glb_model_type, trainControl.method = "none",
        train.method = "mybaseln_classfr")),
        indep_vars = glb_Baseline_mdl_var, rsp_var = glb_rsp_var,
        fit_df = glbObsFit, OOB_df = glbObsOOB)
}
# Most Frequent Outcome ("MFO") model; for regression this is mean(y).
#   - caret's nullModel is not used because model stats are not available
#   - rpart cannot be used for multinomial classification since it predicts
#     non-MFO
# Add the MFO step to the fit.models_0 chunk log (minor step, same major step).
fit.models_0_chunk_df <- myadd_chunk(
    fit.models_0_chunk_df,
    paste0("fit.models_0_", "MFO"),
    major.inc = FALSE,
    label.minor = "myMFO_classfr"
)
## label step_major step_minor label_minor bgn end
## 1 fit.models_0_bgn 1 0 setup 64.812 64.845
## 2 fit.models_0_MFO 1 1 myMFO_classfr 64.846 NA
## elapsed
## 1 0.033
## 2 NA
# Fit the MFO model without resampling; .rnorm is the only predictor passed.
# Regression uses "lm", classification the custom "myMFO_classfr" method.
# NOTE(review): the run log below shows every observation predicted as "Y"
#   although MFO.val is "N" - presumably the 0.1 probability threshold applied
#   to the constant P(Y); confirm in myfit_mdl.
ret_lst <- myfit_mdl(
    mdl_specs_lst = myinit_mdl_specs_lst(
        mdl_specs_lst = list(
            id.prefix = "MFO",
            type = glb_model_type,
            trainControl.method = "none",
            train.method = ifelse(glb_is_regression, "lm", "myMFO_classfr")
        )
    ),
    indep_vars = ".rnorm",
    rsp_var = glb_rsp_var,
    fit_df = glbObsFit,
    OOB_df = glbObsOOB
)
## [1] "fitting model: MFO###myMFO_classfr"
## [1] " indep_vars: .rnorm"
## Fitting parameter = none on full training set
## [1] "in MFO.Classifier$fit"
## [1] "unique.vals:"
## [1] N Y
## Levels: N Y
## [1] "unique.prob:"
## y
## N Y
## 0.820358 0.179642
## [1] "MFO.val:"
## [1] "N"
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 -none- numeric
## MFO.val 1 -none- character
## x.names 1 -none- character
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] "entr MFO.Classifier$predict"
## [1] "exit MFO.Classifier$predict"
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.820358 0.179642
## 2 0.820358 0.179642
## 3 0.820358 0.179642
## 4 0.820358 0.179642
## 5 0.820358 0.179642
## 6 0.820358 0.179642
## Prediction
## Reference N Y
## N 0 3941
## Y 0 863
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.1796420 0.0000000 0.1688795 0.1907952 0.8203580
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## [1] "entr MFO.Classifier$predict"
## [1] "exit MFO.Classifier$predict"
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.820358 0.179642
## 2 0.820358 0.179642
## 3 0.820358 0.179642
## 4 0.820358 0.179642
## 5 0.820358 0.179642
## 6 0.820358 0.179642
## Prediction
## Reference N Y
## N 0 1498
## Y 0 230
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.1331019 0.0000000 0.1174298 0.1500310 0.8668981
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## id feats max.nTuningRuns min.elapsedtime.everything
## 1 MFO###myMFO_classfr .rnorm 0 0.301
## min.elapsedtime.final max.AUCpROC.fit max.Sens.fit max.Spec.fit
## 1 0.003 0.5 1 0
## max.AUCROCR.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.1 0.3045703 0.179642
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.1688795 0.1907952 0
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.2349336 0.1331019
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.1174298 0.150031 0
# "Random" model - classification only; regression needs none because it
# would be the same as MFO.
if (glb_is_classification) {
    # Add the Random step to the fit.models_0 chunk log
    fit.models_0_chunk_df <- myadd_chunk(
        fit.models_0_chunk_df,
        paste0("fit.models_0_", "Random"),
        major.inc = FALSE,
        label.minor = "myrandom_classfr"
    )
    # debugging hook:
    #stop("here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)

    # Fit without resampling; .rnorm is the only predictor passed
    ret_lst <- myfit_mdl(
        mdl_specs_lst = myinit_mdl_specs_lst(
            mdl_specs_lst = list(
                id.prefix = "Random",
                type = glb_model_type,
                trainControl.method = "none",
                train.method = "myrandom_classfr"
            )
        ),
        indep_vars = ".rnorm",
        rsp_var = glb_rsp_var,
        fit_df = glbObsFit,
        OOB_df = glbObsOOB
    )
}
## label step_major step_minor label_minor bgn end
## 2 fit.models_0_MFO 1 1 myMFO_classfr 64.846 68.049
## 3 fit.models_0_Random 1 2 myrandom_classfr 68.049 NA
## elapsed
## 2 3.203
## 3 NA
## [1] "fitting model: Random###myrandom_classfr"
## [1] " indep_vars: .rnorm"
## Fitting parameter = none on full training set
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 table numeric
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] "in Random.Classifier$prob"
## Prediction
## Reference N Y
## N 0 3941
## Y 0 863
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.1796420 0.0000000 0.1688795 0.1907952 0.8203580
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## [1] "in Random.Classifier$prob"
## Prediction
## Reference N Y
## N 0 1498
## Y 0 230
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.1331019 0.0000000 0.1174298 0.1500310 0.8668981
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## id feats max.nTuningRuns
## 1 Random###myrandom_classfr .rnorm 0
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 0.305 0.001 0.4990604
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 0.8312611 0.1668598 0.4972757 0.1
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.3045703 0.179642 0.1688795
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.1907952 0 0.5125675 0.8077437
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0.2173913 0.4857956 0.1 0.2349336
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.1331019 0.1174298 0.150031
## max.Kappa.OOB
## 1 0
# Max.cor.Y: models on the features most correlated with the response,
# used to check the impact of cross-validation settings.
# rpart is not a good candidate here since caret does not optimize cp (the
# only tuning parameter of rpart) well, hence glmnet.
# Add the Max.cor.Y step to the fit.models_0 chunk log.
fit.models_0_chunk_df <- myadd_chunk(
    fit.models_0_chunk_df,
    paste0("fit.models_0_", "Max.cor.Y.rcv.*X*"),
    major.inc = FALSE,
    label.minor = "glmnet"
)
## label step_major step_minor label_minor
## 3 fit.models_0_Random 1 2 myrandom_classfr
## 4 fit.models_0_Max.cor.Y.rcv.*X* 1 3 glmnet
## bgn end elapsed
## 3 68.049 72.629 4.58
## 4 72.630 NA NA
# Reference glmnet fit on max_cor_y_x_vars without any resampling
# (trainControl.method = "none"); labelled 1X1 so it lines up with the
# repeated-CV variants fitted next.
ret_lst <- myfit_mdl(
    mdl_specs_lst = myinit_mdl_specs_lst(
        mdl_specs_lst = list(
            id.prefix = "Max.cor.Y.rcv.1X1",
            type = glb_model_type,
            trainControl.method = "none",
            train.method = "glmnet"
        )
    ),
    indep_vars = max_cor_y_x_vars,
    rsp_var = glb_rsp_var,
    fit_df = glbObsFit,
    OOB_df = glbObsOOB
)
## [1] "fitting model: Max.cor.Y.rcv.1X1###glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 2.0-2
## Fitting alpha = 0.1, lambda = 0.00434 on full training set
## Length Class Mode
## a0 100 -none- numeric
## beta 2100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -4.57159198
## NDSSName.my.fctr#Multimedia#
## -1.22219085
## NDSSName.my.fctr#Opinion#RoomForDebate
## -3.46072453
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 4.06871185
## NDSSName.my.fctr#U.S.#Education
## -1.89443632
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22472818
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.95537118
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 4.55408513
## NDSSName.my.fctrBusiness#Technology#
## 0.77368538
## NDSSName.my.fctrCulture#Arts#
## -0.09465691
## NDSSName.my.fctrForeign#World#
## -1.45528874
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.60117505
## NDSSName.my.fctrMetro#N.Y./Region#
## 0.01563989
## NDSSName.my.fctrOpEd#Opinion#
## 4.51696382
## NDSSName.my.fctrScience#Health#
## 3.51595317
## NDSSName.my.fctrStyles##Fashion
## -1.85948925
## NDSSName.my.fctrStyles#U.S.#
## 3.27995325
## NDSSName.my.fctrTStyle##
## -1.54110404
## NDSSName.my.fctrTravel#Travel#
## -1.41940605
## NDSSName.my.fctrmyOther
## -1.90156922
## WordCount.root2
## 0.08434378
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -4.60394059
## NDSSName.my.fctr#Multimedia#
## -1.25163328
## NDSSName.my.fctr#Opinion#RoomForDebate
## -3.55521332
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 4.09217313
## NDSSName.my.fctr#U.S.#Education
## -1.96172971
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22495986
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.96836050
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 4.58120497
## NDSSName.my.fctrBusiness#Technology#
## 0.78504703
## NDSSName.my.fctrCulture#Arts#
## -0.09069661
## NDSSName.my.fctrForeign#World#
## -1.51061232
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.63313235
## NDSSName.my.fctrMetro#N.Y./Region#
## 0.02466697
## NDSSName.my.fctrOpEd#Opinion#
## 4.54361134
## NDSSName.my.fctrScience#Health#
## 3.53210055
## NDSSName.my.fctrStyles##Fashion
## -1.92188290
## NDSSName.my.fctrStyles#U.S.#
## 3.29488750
## NDSSName.my.fctrTStyle##
## -1.57788931
## NDSSName.my.fctrTravel#Travel#
## -1.47368131
## NDSSName.my.fctrmyOther
## -1.97357582
## WordCount.root2
## 0.08537319
## Prediction
## Reference N Y
## N 3796 145
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.329725e-01 7.692476e-01 9.255302e-01 9.398832e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.026390e-114 8.406670e-02
## Prediction
## Reference N Y
## N 1151 347
## Y 67 163
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.604167e-01 3.148374e-01 7.395703e-01 7.803749e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 8.593187e-43
## id feats
## 1 Max.cor.Y.rcv.1X1###glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 0 1.067 0.278
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8790544 0.9632073 0.7949015 0.9608594
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.8099174 0.9329725
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9255302 0.9398832 0.7692476
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8116126
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4405405 0.7604167
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7395703 0.7803749 0.3148374
# Fit the Max.cor.Y glmnet model under a grid of repeated-CV settings:
# folds 3, 5, ... up to glb_rcv_n_folds + 2 and repeats 1, 3, ... up to
# glb_rcv_n_repeats + 2. Each combination gets the id prefix
# "Max.cor.Y.rcv.<folds>X<repeats>".
# rcv_n_folds == 1 & rcv_n_repeats > 1 crashes, so folds start at 3.
for (rcv_n_folds in seq(from = 3, to = glb_rcv_n_folds + 2, by = 2)) {
    for (rcv_n_repeats in seq(from = 1, to = glb_rcv_n_repeats + 2, by = 2)) {
        # Experiment specific code to avoid caret crash
        # lcl_tune_models_df <- rbind(data.frame()
        #     ,data.frame(method = "glmnet", parameter = "alpha",
        #                 vals = "0.100 0.325 0.550 0.775 1.000")
        #     ,data.frame(method = "glmnet", parameter = "lambda",
        #                 vals = "9.342e-02")
        #     )
        ret_lst <- myfit_mdl(
            mdl_specs_lst = myinit_mdl_specs_lst(
                mdl_specs_lst = list(
                    id.prefix = paste0("Max.cor.Y.rcv.", rcv_n_folds, "X", rcv_n_repeats),
                    type = glb_model_type,
                    # tune.df = lcl_tune_models_df,
                    trainControl.method = "repeatedcv",
                    trainControl.number = rcv_n_folds,
                    trainControl.repeats = rcv_n_repeats,
                    trainControl.classProbs = glb_is_classification,
                    trainControl.summaryFunction = glbMdlMetricSummaryFn,
                    train.method = "glmnet",
                    train.metric = glbMdlMetricSummary,
                    train.maximize = glbMdlMetricMaximize
                )
            ),
            indep_vars = max_cor_y_x_vars,
            rsp_var = glb_rsp_var,
            fit_df = glbObsFit,
            OOB_df = glbObsOOB
        )
    }
}
## [1] "fitting model: Max.cor.Y.rcv.3X1##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.325, lambda = 0.0201 on full training set
## Length Class Mode
## a0 99 -none- numeric
## beta 2079 dgCMatrix S4
## df 99 -none- numeric
## dim 2 -none- numeric
## lambda 99 -none- numeric
## dev.ratio 99 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.89350373
## NDSSName.my.fctr#Multimedia#
## -0.01916344
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.18453357
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.21701058
## NDSSName.my.fctr#U.S.#Education
## -0.47679040
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.09891374
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.87281404
## NDSSName.my.fctrBusiness#Technology#
## 0.40965256
## NDSSName.my.fctrForeign#World#
## -0.05114617
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.47464340
## NDSSName.my.fctrOpEd#Opinion#
## 3.95214357
## NDSSName.my.fctrScience#Health#
## 3.14232408
## NDSSName.my.fctrStyles##Fashion
## -0.31867093
## NDSSName.my.fctrStyles#U.S.#
## 2.92610567
## NDSSName.my.fctrTStyle##
## -0.60538025
## WordCount.root2
## 0.05783392
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.95644632
## NDSSName.my.fctr#Multimedia#
## -0.07182859
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.30034382
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.29694236
## NDSSName.my.fctr#U.S.#Education
## -0.53415905
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.14259759
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.94231812
## NDSSName.my.fctrBusiness#Technology#
## 0.45657914
## NDSSName.my.fctrForeign#World#
## -0.10021084
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.53077048
## NDSSName.my.fctrOpEd#Opinion#
## 4.01324666
## NDSSName.my.fctrScience#Health#
## 3.18936803
## NDSSName.my.fctrStyles##Fashion
## -0.38069674
## NDSSName.my.fctrStyles#U.S.#
## 2.97176051
## NDSSName.my.fctrTStyle##
## -0.64837249
## WordCount.root2
## 0.05978318
## Prediction
## Reference N Y
## N 3796 145
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.329725e-01 7.692476e-01 9.255302e-01 9.398832e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.026390e-114 8.406670e-02
## Prediction
## Reference N Y
## N 1146 352
## Y 67 163
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.575231e-01 3.107477e-01 7.365992e-01 7.775689e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 9.066396e-44
## id feats
## 1 Max.cor.Y.rcv.3X1##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 2.563 0.277
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8767919 0.964476 0.7891078 0.9582555
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.8099174 0.9335973
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9255302 0.9398832 0.7691678
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8067975
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4375839 0.7575231
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7365992 0.7775689 0.3107477
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.007015493 0.02403706
## [1] "fitting model: Max.cor.Y.rcv.3X3##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.325, lambda = 0.0201 on full training set
## Length Class Mode
## a0 99 -none- numeric
## beta 2079 dgCMatrix S4
## df 99 -none- numeric
## dim 2 -none- numeric
## lambda 99 -none- numeric
## dev.ratio 99 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.89350373
## NDSSName.my.fctr#Multimedia#
## -0.01916344
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.18453357
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.21701058
## NDSSName.my.fctr#U.S.#Education
## -0.47679040
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.09891374
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.87281404
## NDSSName.my.fctrBusiness#Technology#
## 0.40965256
## NDSSName.my.fctrForeign#World#
## -0.05114617
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.47464340
## NDSSName.my.fctrOpEd#Opinion#
## 3.95214357
## NDSSName.my.fctrScience#Health#
## 3.14232408
## NDSSName.my.fctrStyles##Fashion
## -0.31867093
## NDSSName.my.fctrStyles#U.S.#
## 2.92610567
## NDSSName.my.fctrTStyle##
## -0.60538025
## WordCount.root2
## 0.05783392
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.95644632
## NDSSName.my.fctr#Multimedia#
## -0.07182859
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.30034382
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.29694236
## NDSSName.my.fctr#U.S.#Education
## -0.53415905
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.14259759
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.94231812
## NDSSName.my.fctrBusiness#Technology#
## 0.45657914
## NDSSName.my.fctrForeign#World#
## -0.10021084
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.53077048
## NDSSName.my.fctrOpEd#Opinion#
## 4.01324666
## NDSSName.my.fctrScience#Health#
## 3.18936803
## NDSSName.my.fctrStyles##Fashion
## -0.38069674
## NDSSName.my.fctrStyles#U.S.#
## 2.97176051
## NDSSName.my.fctrTStyle##
## -0.64837249
## WordCount.root2
## 0.05978318
## Prediction
## Reference N Y
## N 3796 145
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.329725e-01 7.692476e-01 9.255302e-01 9.398832e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.026390e-114 8.406670e-02
## Prediction
## Reference N Y
## N 1146 352
## Y 67 163
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.575231e-01 3.107477e-01 7.365992e-01 7.775689e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 9.066396e-44
## id feats
## 1 Max.cor.Y.rcv.3X3##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 4.736 0.278
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8767919 0.964476 0.7891078 0.9582555
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.8099174 0.9333193
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9255302 0.9398832 0.7690803
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8067975
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4375839 0.7575231
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7365992 0.7775689 0.3107477
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.005178375 0.01754365
## [1] "fitting model: Max.cor.Y.rcv.3X5##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.325, lambda = 0.0201 on full training set
## Length Class Mode
## a0 99 -none- numeric
## beta 2079 dgCMatrix S4
## df 99 -none- numeric
## dim 2 -none- numeric
## lambda 99 -none- numeric
## dev.ratio 99 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.89350373
## NDSSName.my.fctr#Multimedia#
## -0.01916344
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.18453357
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.21701058
## NDSSName.my.fctr#U.S.#Education
## -0.47679040
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.09891374
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.87281404
## NDSSName.my.fctrBusiness#Technology#
## 0.40965256
## NDSSName.my.fctrForeign#World#
## -0.05114617
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.47464340
## NDSSName.my.fctrOpEd#Opinion#
## 3.95214357
## NDSSName.my.fctrScience#Health#
## 3.14232408
## NDSSName.my.fctrStyles##Fashion
## -0.31867093
## NDSSName.my.fctrStyles#U.S.#
## 2.92610567
## NDSSName.my.fctrTStyle##
## -0.60538025
## WordCount.root2
## 0.05783392
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.95644632
## NDSSName.my.fctr#Multimedia#
## -0.07182859
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.30034382
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.29694236
## NDSSName.my.fctr#U.S.#Education
## -0.53415905
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.14259759
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.94231812
## NDSSName.my.fctrBusiness#Technology#
## 0.45657914
## NDSSName.my.fctrForeign#World#
## -0.10021084
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.53077048
## NDSSName.my.fctrOpEd#Opinion#
## 4.01324666
## NDSSName.my.fctrScience#Health#
## 3.18936803
## NDSSName.my.fctrStyles##Fashion
## -0.38069674
## NDSSName.my.fctrStyles#U.S.#
## 2.97176051
## NDSSName.my.fctrTStyle##
## -0.64837249
## WordCount.root2
## 0.05978318
## Prediction
## Reference N Y
## N 3796 145
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.329725e-01 7.692476e-01 9.255302e-01 9.398832e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.026390e-114 8.406670e-02
## Prediction
## Reference N Y
## N 1146 352
## Y 67 163
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.575231e-01 3.107477e-01 7.365992e-01 7.775689e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 9.066396e-44
## id feats
## 1 Max.cor.Y.rcv.3X5##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 7.231 0.278
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8767919 0.964476 0.7891078 0.9582555
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.8099174 0.9332218
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9255302 0.9398832 0.7686375
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8067975
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4375839 0.7575231
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7365992 0.7775689 0.3107477
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.005396525 0.01835474
## [1] "fitting model: Max.cor.Y.rcv.5X1##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0201 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst =
## list(id.prefix = paste0("Max.cor.Y.rcv.", : model's bestTune found at an
## extreme of tuneGrid for parameter: alpha
## Length Class Mode
## a0 100 -none- numeric
## beta 2100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.81141260
## NDSSName.my.fctr#Multimedia#
## -0.68105584
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.92624537
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.40699589
## NDSSName.my.fctr#U.S.#Education
## -0.98291999
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22577146
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.64343834
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.82797332
## NDSSName.my.fctrBusiness#Technology#
## 0.45317927
## NDSSName.my.fctrCulture#Arts#
## -0.17187706
## NDSSName.my.fctrForeign#World#
## -0.72035867
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.99018968
## NDSSName.my.fctrOpEd#Opinion#
## 3.81891156
## NDSSName.my.fctrScience#Health#
## 3.05516080
## NDSSName.my.fctrStyles##Fashion
## -0.97651721
## NDSSName.my.fctrStyles#U.S.#
## 2.84779285
## NDSSName.my.fctrTStyle##
## -0.94109645
## NDSSName.my.fctrTravel#Travel#
## -0.68827560
## NDSSName.my.fctrmyOther
## -0.84423735
## WordCount.root2
## 0.06115867
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.87108412
## NDSSName.my.fctr#Multimedia#
## -0.71588942
## NDSSName.my.fctr#Opinion#RoomForDebate
## -2.02010163
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.46715540
## NDSSName.my.fctr#U.S.#Education
## -1.02957582
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22558850
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.66798026
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.89132347
## NDSSName.my.fctrBusiness#Technology#
## 0.48212450
## NDSSName.my.fctrCulture#Arts#
## -0.16733777
## NDSSName.my.fctrForeign#World#
## -0.75793881
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.03076807
## NDSSName.my.fctrOpEd#Opinion#
## 3.87908175
## NDSSName.my.fctrScience#Health#
## 3.09788786
## NDSSName.my.fctrStyles##Fashion
## -1.02481879
## NDSSName.my.fctrStyles#U.S.#
## 2.88826078
## NDSSName.my.fctrTStyle##
## -0.97585470
## NDSSName.my.fctrTravel#Travel#
## -0.72668427
## NDSSName.my.fctrmyOther
## -0.90347045
## WordCount.root2
## 0.06289698
## Prediction
## Reference N Y
## N 3800 141
## Y 179 684
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.333888e-01 7.700473e-01 9.259666e-01 9.402789e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.097051e-115 3.860591e-02
## Prediction
## Reference N Y
## N 1137 361
## Y 53 177
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.604167e-01 3.373693e-01 7.395703e-01 7.803749e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.935747e-51
## id feats
## 1 Max.cor.Y.rcv.5X1##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 3.463 0.28
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8784031 0.9642223 0.792584 0.9607052
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.8104265 0.9331818
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9259666 0.9402789 0.7689055
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8114863
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4609375 0.7604167
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7395703 0.7803749 0.3373693
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.008837283 0.03133449
## [1] "fitting model: Max.cor.Y.rcv.5X3##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0201 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst =
## list(id.prefix = paste0("Max.cor.Y.rcv.", : model's bestTune found at an
## extreme of tuneGrid for parameter: alpha
## Length Class Mode
## a0 100 -none- numeric
## beta 2100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.81141260
## NDSSName.my.fctr#Multimedia#
## -0.68105584
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.92624537
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.40699589
## NDSSName.my.fctr#U.S.#Education
## -0.98291999
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22577146
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.64343834
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.82797332
## NDSSName.my.fctrBusiness#Technology#
## 0.45317927
## NDSSName.my.fctrCulture#Arts#
## -0.17187706
## NDSSName.my.fctrForeign#World#
## -0.72035867
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.99018968
## NDSSName.my.fctrOpEd#Opinion#
## 3.81891156
## NDSSName.my.fctrScience#Health#
## 3.05516080
## NDSSName.my.fctrStyles##Fashion
## -0.97651721
## NDSSName.my.fctrStyles#U.S.#
## 2.84779285
## NDSSName.my.fctrTStyle##
## -0.94109645
## NDSSName.my.fctrTravel#Travel#
## -0.68827560
## NDSSName.my.fctrmyOther
## -0.84423735
## WordCount.root2
## 0.06115867
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.87108412
## NDSSName.my.fctr#Multimedia#
## -0.71588942
## NDSSName.my.fctr#Opinion#RoomForDebate
## -2.02010163
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.46715540
## NDSSName.my.fctr#U.S.#Education
## -1.02957582
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22558850
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.66798026
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.89132347
## NDSSName.my.fctrBusiness#Technology#
## 0.48212450
## NDSSName.my.fctrCulture#Arts#
## -0.16733777
## NDSSName.my.fctrForeign#World#
## -0.75793881
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.03076807
## NDSSName.my.fctrOpEd#Opinion#
## 3.87908175
## NDSSName.my.fctrScience#Health#
## 3.09788786
## NDSSName.my.fctrStyles##Fashion
## -1.02481879
## NDSSName.my.fctrStyles#U.S.#
## 2.88826078
## NDSSName.my.fctrTStyle##
## -0.97585470
## NDSSName.my.fctrTravel#Travel#
## -0.72668427
## NDSSName.my.fctrmyOther
## -0.90347045
## WordCount.root2
## 0.06289698
## Prediction
## Reference N Y
## N 3800 141
## Y 179 684
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.333888e-01 7.700473e-01 9.259666e-01 9.402789e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.097051e-115 3.860591e-02
## Prediction
## Reference N Y
## N 1137 361
## Y 53 177
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.604167e-01 3.373693e-01 7.395703e-01 7.803749e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.935747e-51
## id feats
## 1 Max.cor.Y.rcv.5X3##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 6.779 0.278
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8784031 0.9642223 0.792584 0.9607052
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.8104265 0.9333905
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9259666 0.9402789 0.7698577
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8114863
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4609375 0.7604167
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7395703 0.7803749 0.3373693
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.006138477 0.02161286
## [1] "fitting model: Max.cor.Y.rcv.5X5##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0201 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst =
## list(id.prefix = paste0("Max.cor.Y.rcv.", : model's bestTune found at an
## extreme of tuneGrid for parameter: alpha
## Length Class Mode
## a0 100 -none- numeric
## beta 2100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.81141260
## NDSSName.my.fctr#Multimedia#
## -0.68105584
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.92624537
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.40699589
## NDSSName.my.fctr#U.S.#Education
## -0.98291999
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22577146
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.64343834
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.82797332
## NDSSName.my.fctrBusiness#Technology#
## 0.45317927
## NDSSName.my.fctrCulture#Arts#
## -0.17187706
## NDSSName.my.fctrForeign#World#
## -0.72035867
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.99018968
## NDSSName.my.fctrOpEd#Opinion#
## 3.81891156
## NDSSName.my.fctrScience#Health#
## 3.05516080
## NDSSName.my.fctrStyles##Fashion
## -0.97651721
## NDSSName.my.fctrStyles#U.S.#
## 2.84779285
## NDSSName.my.fctrTStyle##
## -0.94109645
## NDSSName.my.fctrTravel#Travel#
## -0.68827560
## NDSSName.my.fctrmyOther
## -0.84423735
## WordCount.root2
## 0.06115867
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.87108412
## NDSSName.my.fctr#Multimedia#
## -0.71588942
## NDSSName.my.fctr#Opinion#RoomForDebate
## -2.02010163
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.46715540
## NDSSName.my.fctr#U.S.#Education
## -1.02957582
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22558850
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.66798026
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.89132347
## NDSSName.my.fctrBusiness#Technology#
## 0.48212450
## NDSSName.my.fctrCulture#Arts#
## -0.16733777
## NDSSName.my.fctrForeign#World#
## -0.75793881
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.03076807
## NDSSName.my.fctrOpEd#Opinion#
## 3.87908175
## NDSSName.my.fctrScience#Health#
## 3.09788786
## NDSSName.my.fctrStyles##Fashion
## -1.02481879
## NDSSName.my.fctrStyles#U.S.#
## 2.88826078
## NDSSName.my.fctrTStyle##
## -0.97585470
## NDSSName.my.fctrTravel#Travel#
## -0.72668427
## NDSSName.my.fctrmyOther
## -0.90347045
## WordCount.root2
## 0.06289698
## Prediction
## Reference N Y
## N 3800 141
## Y 179 684
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.333888e-01 7.700473e-01 9.259666e-01 9.402789e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.097051e-115 3.860591e-02
## Prediction
## Reference N Y
## N 1137 361
## Y 53 177
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.604167e-01 3.373693e-01 7.395703e-01 7.803749e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.935747e-51
## id feats
## 1 Max.cor.Y.rcv.5X5##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 9.27 0.28
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8784031 0.9642223 0.792584 0.9607052
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.8104265 0.9331816
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9259666 0.9402789 0.7691429
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8114863
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4609375 0.7604167
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7395703 0.7803749 0.3373693
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.0062138 0.02210061
# Parallel-coordinates view of the repeated-CV glmnet variants, used to compare
# cross-validation settings. Columns shown: the model id, max.nTuningRuns, the
# evaluation metrics listed in glbMdlMetricsEval that exist in glb_models_df,
# and every column whose name contains the literal text "opt.".
tmp_models_cols <- c(
  "id",
  "max.nTuningRuns",
  glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)],
  names(glb_models_df)[grepl("opt.", names(glb_models_df), fixed = TRUE)]
)
# Keep only rows whose id contains "Max.cor.Y.rcv." (literal match), drop the
# feats column, then narrow to the columns chosen above before plotting.
print(
  myplot_parcoord(
    obs_df = subset(
      glb_models_df,
      subset = grepl("Max.cor.Y.rcv.", id, fixed = TRUE),
      select = -feats
    )[, tmp_models_cols],
    id_var = "id"
  )
)
# Record the start of the next step in the chunk log: minor label "rpart",
# without a major-step increment (major.inc = FALSE).
fit.models_0_chunk_df <- myadd_chunk(
  fit.models_0_chunk_df,
  paste0("fit.models_0_", "Max.cor.Y[rcv.1X1.cp.0|]"),
  major.inc = FALSE,
  label.minor = "rpart"
)
## label step_major step_minor label_minor
## 4 fit.models_0_Max.cor.Y.rcv.*X* 1 3 glmnet
## 5 fit.models_0_Max.cor.Y[rcv.1X1.cp.0|] 1 4 rpart
## bgn end elapsed
## 4 72.630 150.481 77.851
## 5 150.482 NA NA
# Fit one rpart model on the predictors in max_cor_y_x_vars with the
# complexity parameter pinned at cp = 0 (tune.df has min == max == 0).
# trainControl.method = "none" is presumably forwarded to caret's
# trainControl() so that no resampling is performed -- confirm in myfit_mdl.
ret_lst <- myfit_mdl(
  mdl_specs_lst = myinit_mdl_specs_lst(
    mdl_specs_lst = list(
      id.prefix = "Max.cor.Y.rcv.1X1.cp.0",
      type = glb_model_type,
      trainControl.method = "none",
      train.method = "rpart",
      tune.df = data.frame(
        method = "rpart",
        parameter = "cp",
        min = 0.0,
        max = 0.0,
        by = 0.1
      )
    )
  ),
  indep_vars = max_cor_y_x_vars,
  rsp_var = glb_rsp_var,
  fit_df = glbObsFit,
  OOB_df = glbObsOOB
)
## [1] "fitting model: Max.cor.Y.rcv.1X1.cp.0###rpart"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Loading required package: rpart
## Fitting cp = 0 on full training set
## Loading required package: rpart.plot
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 4804
##
## CP nsplit rel error
## 1 0.3696407879 0 1.0000000
## 2 0.0984936269 1 0.6303592
## 3 0.0857473928 2 0.5318656
## 4 0.0567786790 3 0.4461182
## 5 0.0104287370 4 0.3893395
## 6 0.0057937428 5 0.3789108
## 7 0.0034762457 7 0.3673233
## 8 0.0023174971 8 0.3638470
## 9 0.0011587486 11 0.3568946
## 10 0.0007724990 13 0.3545771
## 11 0.0005793743 16 0.3522596
## 12 0.0004213631 24 0.3476246
## 13 0.0003862495 35 0.3429896
## 14 0.0000000000 41 0.3406721
##
## Variable importance
## NDSSName.my.fctrOpEd#Opinion#
## 48
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 14
## NDSSName.my.fctrScience#Health#
## 14
## NDSSName.my.fctrStyles#U.S.#
## 11
## WordCount.root2
## 9
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 2
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## 1
##
## Node number 1: 4804 observations, complexity param=0.3696408
## predicted class=N expected loss=0.179642 P(node) =1
## class counts: 3941 863
## probabilities: 0.820 0.180
## left son=2 (4367 obs) right son=3 (437 obs)
## Primary splits:
## NDSSName.my.fctrOpEd#Opinion# < 0.5 to the left, improve=451.59770, (0 missing)
## NDSSName.my.fctrBusiness#Crosswords/Games# < 0.5 to the left, improve=112.88510, (0 missing)
## WordCount.root2 < 25.75849 to the left, improve=111.17610, (0 missing)
## NDSSName.my.fctrScience#Health# < 0.5 to the left, improve= 99.35206, (0 missing)
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve= 68.73272, (0 missing)
##
## Node number 2: 4367 observations, complexity param=0.09849363
## predicted class=N expected loss=0.1110602 P(node) =0.9090341
## class counts: 3882 485
## probabilities: 0.889 0.111
## left son=4 (4262 obs) right son=5 (105 obs)
## Primary splits:
## NDSSName.my.fctrBusiness#Crosswords/Games# < 0.5 to the left, improve=135.55130, (0 missing)
## NDSSName.my.fctrScience#Health# < 0.5 to the left, improve=125.07920, (0 missing)
## WordCount.root2 < 25.75849 to the left, improve= 94.70710, (0 missing)
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve= 88.56821, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve= 18.74400, (0 missing)
##
## Node number 3: 437 observations
## predicted class=Y expected loss=0.1350114 P(node) =0.09096586
## class counts: 59 378
## probabilities: 0.135 0.865
##
## Node number 4: 4262 observations, complexity param=0.08574739
## predicted class=N expected loss=0.09150634 P(node) =0.8871774
## class counts: 3872 390
## probabilities: 0.908 0.092
## left son=8 (4114 obs) right son=9 (148 obs)
## Primary splits:
## NDSSName.my.fctrScience#Health# < 0.5 to the left, improve=132.96710, (0 missing)
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve= 94.69099, (0 missing)
## WordCount.root2 < 26.49528 to the left, improve= 84.07487, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve= 19.71762, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve= 10.17000, (0 missing)
##
## Node number 5: 105 observations, complexity param=0.002317497
## predicted class=Y expected loss=0.0952381 P(node) =0.02185679
## class counts: 10 95
## probabilities: 0.095 0.905
## left son=10 (12 obs) right son=11 (93 obs)
## Primary splits:
## WordCount.root2 < 18.9043 to the left, improve=6.455453, (0 missing)
##
## Node number 8: 4114 observations, complexity param=0.05677868
## predicted class=N expected loss=0.06781721 P(node) =0.8563697
## class counts: 3835 279
## probabilities: 0.932 0.068
## left son=16 (3987 obs) right son=17 (127 obs)
## Primary splits:
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve=102.410700, (0 missing)
## WordCount.root2 < 25.01 to the left, improve= 47.352210, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve= 20.930810, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve= 5.249425, (0 missing)
## NDSSName.my.fctrBusiness#Technology# < 0.5 to the left, improve= 2.395935, (0 missing)
##
## Node number 9: 148 observations, complexity param=0.01042874
## predicted class=Y expected loss=0.25 P(node) =0.03080766
## class counts: 37 111
## probabilities: 0.250 0.750
## left son=18 (55 obs) right son=19 (93 obs)
## Primary splits:
## WordCount.root2 < 22.72663 to the left, improve=19.274, (0 missing)
##
## Node number 10: 12 observations
## predicted class=N expected loss=0.4166667 P(node) =0.002497918
## class counts: 7 5
## probabilities: 0.583 0.417
##
## Node number 11: 93 observations
## predicted class=Y expected loss=0.03225806 P(node) =0.01935887
## class counts: 3 90
## probabilities: 0.032 0.968
##
## Node number 16: 3987 observations, complexity param=0.005793743
## predicted class=N expected loss=0.04790569 P(node) =0.8299334
## class counts: 3796 191
## probabilities: 0.952 0.048
## left son=32 (2982 obs) right son=33 (1005 obs)
## Primary splits:
## WordCount.root2 < 25.01 to the left, improve=29.253580, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve=21.978920, (0 missing)
## NDSSName.my.fctrBusiness#Technology# < 0.5 to the left, improve= 3.887348, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve= 2.348653, (0 missing)
## NDSSName.my.fctr#U.S.#Education < 0.5 to the right, improve= 1.187739, (0 missing)
## Surrogate splits:
## NDSSName.my.fctr#Opinion#RoomForDebate < 0.5 to the left, agree=0.758, adj=0.042, (0 split)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the left, agree=0.752, adj=0.016, (0 split)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, agree=0.750, adj=0.008, (0 split)
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness < 0.5 to the left, agree=0.748, adj=0.002, (0 split)
##
## Node number 17: 127 observations, complexity param=0.003476246
## predicted class=Y expected loss=0.3070866 P(node) =0.0264363
## class counts: 39 88
## probabilities: 0.307 0.693
## left son=34 (13 obs) right son=35 (114 obs)
## Primary splits:
## WordCount.root2 < 15.32846 to the left, improve=2.753047, (0 missing)
##
## Node number 18: 55 observations, complexity param=0.002317497
## predicted class=N expected loss=0.4181818 P(node) =0.01144879
## class counts: 32 23
## probabilities: 0.582 0.418
## left son=36 (9 obs) right son=37 (46 obs)
## Primary splits:
## WordCount.root2 < 19.93708 to the right, improve=0.8264383, (0 missing)
##
## Node number 19: 93 observations
## predicted class=Y expected loss=0.05376344 P(node) =0.01935887
## class counts: 5 88
## probabilities: 0.054 0.946
##
## Node number 32: 2982 observations
## predicted class=N expected loss=0.01274313 P(node) =0.6207327
## class counts: 2944 38
## probabilities: 0.987 0.013
##
## Node number 33: 1005 observations, complexity param=0.005793743
## predicted class=N expected loss=0.1522388 P(node) =0.2092007
## class counts: 852 153
## probabilities: 0.848 0.152
## left son=66 (993 obs) right son=67 (12 obs)
## Primary splits:
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve=14.193880, (0 missing)
## NDSSName.my.fctrCulture#Arts# < 0.5 to the left, improve= 3.669601, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve= 3.556158, (0 missing)
## WordCount.root2 < 34.19795 to the left, improve= 2.582851, (0 missing)
## NDSSName.my.fctr#Opinion#RoomForDebate < 0.5 to the right, improve= 2.031748, (0 missing)
##
## Node number 34: 13 observations
## predicted class=N expected loss=0.3846154 P(node) =0.002706078
## class counts: 8 5
## probabilities: 0.615 0.385
##
## Node number 35: 114 observations, complexity param=0.000772499
## predicted class=Y expected loss=0.2719298 P(node) =0.02373022
## class counts: 31 83
## probabilities: 0.272 0.728
## left son=70 (79 obs) right son=71 (35 obs)
## Primary splits:
## WordCount.root2 < 29.21444 to the left, improve=1.020279, (0 missing)
##
## Node number 36: 9 observations
## predicted class=N expected loss=0.2222222 P(node) =0.001873439
## class counts: 7 2
## probabilities: 0.778 0.222
##
## Node number 37: 46 observations, complexity param=0.002317497
## predicted class=N expected loss=0.4565217 P(node) =0.009575354
## class counts: 25 21
## probabilities: 0.543 0.457
## left son=74 (36 obs) right son=75 (10 obs)
## Primary splits:
## WordCount.root2 < 17.01454 to the left, improve=1.514976, (0 missing)
##
## Node number 66: 993 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.143001 P(node) =0.2067027
## class counts: 851 142
## probabilities: 0.857 0.143
## left son=132 (930 obs) right son=133 (63 obs)
## Primary splits:
## NDSSName.my.fctrCulture#Arts# < 0.5 to the left, improve=4.094729, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=3.106316, (0 missing)
## WordCount.root2 < 29.5127 to the left, improve=2.722793, (0 missing)
## NDSSName.my.fctrBusiness#Technology# < 0.5 to the left, improve=1.962300, (0 missing)
## NDSSName.my.fctr#Opinion#RoomForDebate < 0.5 to the right, improve=1.793603, (0 missing)
##
## Node number 67: 12 observations
## predicted class=Y expected loss=0.08333333 P(node) =0.002497918
## class counts: 1 11
## probabilities: 0.083 0.917
##
## Node number 70: 79 observations, complexity param=0.000772499
## predicted class=Y expected loss=0.3164557 P(node) =0.01644463
## class counts: 25 54
## probabilities: 0.316 0.684
## left son=140 (25 obs) right son=141 (54 obs)
## Primary splits:
## WordCount.root2 < 27.36786 to the right, improve=0.5105485, (0 missing)
##
## Node number 71: 35 observations
## predicted class=Y expected loss=0.1714286 P(node) =0.007285595
## class counts: 6 29
## probabilities: 0.171 0.829
##
## Node number 74: 36 observations, complexity param=0.001158749
## predicted class=N expected loss=0.3888889 P(node) =0.007493755
## class counts: 22 14
## probabilities: 0.611 0.389
## left son=148 (8 obs) right son=149 (28 obs)
## Primary splits:
## WordCount.root2 < 15.74773 to the right, improve=0.3968254, (0 missing)
##
## Node number 75: 10 observations
## predicted class=Y expected loss=0.3 P(node) =0.002081599
## class counts: 3 7
## probabilities: 0.300 0.700
##
## Node number 132: 930 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.1311828 P(node) =0.1935887
## class counts: 808 122
## probabilities: 0.869 0.131
## left son=264 (627 obs) right son=265 (303 obs)
## Primary splits:
## WordCount.root2 < 33.97057 to the left, improve=2.913816, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=2.586923, (0 missing)
## NDSSName.my.fctrBusiness#Technology# < 0.5 to the left, improve=2.402029, (0 missing)
## NDSSName.my.fctr#Opinion#RoomForDebate < 0.5 to the right, improve=1.513920, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook < 0.5 to the left, improve=1.276783, (0 missing)
## Surrogate splits:
## NDSSName.my.fctr#Opinion#RoomForDebate < 0.5 to the left, agree=0.719, adj=0.139, (0 split)
##
## Node number 133: 63 observations, complexity param=0.0003862495
## predicted class=N expected loss=0.3174603 P(node) =0.01311407
## class counts: 43 20
## probabilities: 0.683 0.317
## left son=266 (14 obs) right son=267 (49 obs)
## Primary splits:
## WordCount.root2 < 26.99984 to the left, improve=0.38322, (0 missing)
##
## Node number 140: 25 observations, complexity param=0.000772499
## predicted class=Y expected loss=0.4 P(node) =0.005203997
## class counts: 10 15
## probabilities: 0.400 0.600
## left son=280 (8 obs) right son=281 (17 obs)
## Primary splits:
## WordCount.root2 < 28.02674 to the left, improve=1.191176, (0 missing)
##
## Node number 141: 54 observations, complexity param=0.0003862495
## predicted class=Y expected loss=0.2777778 P(node) =0.01124063
## class counts: 15 39
## probabilities: 0.278 0.722
## left son=282 (45 obs) right son=283 (9 obs)
## Primary splits:
## WordCount.root2 < 26.55173 to the left, improve=0.6, (0 missing)
##
## Node number 148: 8 observations
## predicted class=N expected loss=0.25 P(node) =0.001665279
## class counts: 6 2
## probabilities: 0.750 0.250
##
## Node number 149: 28 observations, complexity param=0.001158749
## predicted class=N expected loss=0.4285714 P(node) =0.005828476
## class counts: 16 12
## probabilities: 0.571 0.429
## left son=298 (20 obs) right son=299 (8 obs)
## Primary splits:
## WordCount.root2 < 15.06648 to the left, improve=0.8642857, (0 missing)
##
## Node number 264: 627 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.1036683 P(node) =0.1305162
## class counts: 562 65
## probabilities: 0.896 0.104
## left son=528 (561 obs) right son=529 (66 obs)
## Primary splits:
## NDSSName.my.fctrBusiness#Technology# < 0.5 to the left, improve=2.8404170, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=1.0796950, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve=1.0670160, (0 missing)
## WordCount.root2 < 29.5127 to the left, improve=0.8966879, (0 missing)
## NDSSName.my.fctr#Multimedia# < 0.5 to the right, improve=0.4399337, (0 missing)
##
## Node number 265: 303 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.1881188 P(node) =0.06307244
## class counts: 246 57
## probabilities: 0.812 0.188
## left son=530 (222 obs) right son=531 (81 obs)
## Primary splits:
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook < 0.5 to the left, improve=5.4890570, (0 missing)
## WordCount.root2 < 38.17067 to the right, improve=5.0156320, (0 missing)
## NDSSName.my.fctr#Opinion#RoomForDebate < 0.5 to the right, improve=3.4510070, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=1.5155860, (0 missing)
## NDSSName.my.fctr#U.S.#Education < 0.5 to the right, improve=0.8078801, (0 missing)
## Surrogate splits:
## WordCount.root2 < 34.08078 to the right, agree=0.739, adj=0.025, (0 split)
##
## Node number 266: 14 observations
## predicted class=N expected loss=0.2142857 P(node) =0.002914238
## class counts: 11 3
## probabilities: 0.786 0.214
##
## Node number 267: 49 observations, complexity param=0.0003862495
## predicted class=N expected loss=0.3469388 P(node) =0.01019983
## class counts: 32 17
## probabilities: 0.653 0.347
## left son=534 (10 obs) right son=535 (39 obs)
## Primary splits:
## WordCount.root2 < 41.56249 to the right, improve=0.5425432, (0 missing)
##
## Node number 280: 8 observations
## predicted class=N expected loss=0.375 P(node) =0.001665279
## class counts: 5 3
## probabilities: 0.625 0.375
##
## Node number 281: 17 observations
## predicted class=Y expected loss=0.2941176 P(node) =0.003538718
## class counts: 5 12
## probabilities: 0.294 0.706
##
## Node number 282: 45 observations, complexity param=0.0003862495
## predicted class=Y expected loss=0.3111111 P(node) =0.009367194
## class counts: 14 31
## probabilities: 0.311 0.689
## left son=564 (23 obs) right son=565 (22 obs)
## Primary splits:
## WordCount.root2 < 21.70252 to the right, improve=0.6050944, (0 missing)
##
## Node number 283: 9 observations
## predicted class=Y expected loss=0.1111111 P(node) =0.001873439
## class counts: 1 8
## probabilities: 0.111 0.889
##
## Node number 298: 20 observations
## predicted class=N expected loss=0.35 P(node) =0.004163197
## class counts: 13 7
## probabilities: 0.650 0.350
##
## Node number 299: 8 observations
## predicted class=Y expected loss=0.375 P(node) =0.001665279
## class counts: 3 5
## probabilities: 0.375 0.625
##
## Node number 528: 561 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.08734403 P(node) =0.1167777
## class counts: 512 49
## probabilities: 0.913 0.087
## left son=1056 (281 obs) right son=1057 (280 obs)
## Primary splits:
## WordCount.root2 < 29.33428 to the left, improve=1.5853030, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve=0.7645570, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=0.7250433, (0 missing)
## NDSSName.my.fctrStyles##Fashion < 0.5 to the right, improve=0.3000638, (0 missing)
## NDSSName.my.fctr#Multimedia# < 0.5 to the right, improve=0.2729836, (0 missing)
## Surrogate splits:
## NDSSName.my.fctrMetro#N.Y./Region# < 0.5 to the left, agree=0.560, adj=0.118, (0 split)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, agree=0.533, adj=0.064, (0 split)
## NDSSName.my.fctr#Multimedia# < 0.5 to the right, agree=0.524, adj=0.046, (0 split)
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness < 0.5 to the left, agree=0.515, adj=0.029, (0 split)
## NDSSName.my.fctrStyles##Fashion < 0.5 to the right, agree=0.512, adj=0.021, (0 split)
##
## Node number 529: 66 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.2424242 P(node) =0.01373855
## class counts: 50 16
## probabilities: 0.758 0.242
## left son=1058 (38 obs) right son=1059 (28 obs)
## Primary splits:
## WordCount.root2 < 27.86575 to the left, improve=0.6070859, (0 missing)
##
## Node number 530: 222 observations
## predicted class=N expected loss=0.1306306 P(node) =0.04621149
## class counts: 193 29
## probabilities: 0.869 0.131
##
## Node number 531: 81 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.345679 P(node) =0.01686095
## class counts: 53 28
## probabilities: 0.654 0.346
## left son=1062 (15 obs) right son=1063 (66 obs)
## Primary splits:
## WordCount.root2 < 41.59766 to the right, improve=2.866218, (0 missing)
##
## Node number 534: 10 observations
## predicted class=N expected loss=0.2 P(node) =0.002081599
## class counts: 8 2
## probabilities: 0.800 0.200
##
## Node number 535: 39 observations, complexity param=0.0003862495
## predicted class=N expected loss=0.3846154 P(node) =0.008118235
## class counts: 24 15
## probabilities: 0.615 0.385
## left son=1070 (32 obs) right son=1071 (7 obs)
## Primary splits:
## WordCount.root2 < 34.23387 to the left, improve=0.595467, (0 missing)
##
## Node number 564: 23 observations, complexity param=0.0003862495
## predicted class=Y expected loss=0.3913043 P(node) =0.004787677
## class counts: 9 14
## probabilities: 0.391 0.609
## left son=1128 (7 obs) right son=1129 (16 obs)
## Primary splits:
## WordCount.root2 < 23.6326 to the left, improve=0.6529503, (0 missing)
##
## Node number 565: 22 observations
## predicted class=Y expected loss=0.2272727 P(node) =0.004579517
## class counts: 5 17
## probabilities: 0.227 0.773
##
## Node number 1056: 281 observations
## predicted class=N expected loss=0.04982206 P(node) =0.05849292
## class counts: 267 14
## probabilities: 0.950 0.050
##
## Node number 1057: 280 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.125 P(node) =0.05828476
## class counts: 245 35
## probabilities: 0.875 0.125
## left son=2114 (71 obs) right son=2115 (209 obs)
## Primary splits:
## WordCount.root2 < 32.57299 to the right, improve=0.8968765, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve=0.7830739, (0 missing)
## NDSSName.my.fctrMetro#N.Y./Region# < 0.5 to the right, improve=0.3683673, (0 missing)
## NDSSName.my.fctr#Multimedia# < 0.5 to the right, improve=0.3578067, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=0.3021494, (0 missing)
##
## Node number 1058: 38 observations
## predicted class=N expected loss=0.1842105 P(node) =0.007910075
## class counts: 31 7
## probabilities: 0.816 0.184
##
## Node number 1059: 28 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.3214286 P(node) =0.005828476
## class counts: 19 9
## probabilities: 0.679 0.321
## left son=2118 (19 obs) right son=2119 (9 obs)
## Primary splits:
## WordCount.root2 < 28.6269 to the right, improve=1.454052, (0 missing)
##
## Node number 1062: 15 observations
## predicted class=N expected loss=0.06666667 P(node) =0.003122398
## class counts: 14 1
## probabilities: 0.933 0.067
##
## Node number 1063: 66 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.4090909 P(node) =0.01373855
## class counts: 39 27
## probabilities: 0.591 0.409
## left son=2126 (25 obs) right son=2127 (41 obs)
## Primary splits:
## WordCount.root2 < 35.6581 to the left, improve=1.341286, (0 missing)
##
## Node number 1070: 32 observations
## predicted class=N expected loss=0.34375 P(node) =0.006661116
## class counts: 21 11
## probabilities: 0.656 0.344
##
## Node number 1071: 7 observations
## predicted class=Y expected loss=0.4285714 P(node) =0.001457119
## class counts: 3 4
## probabilities: 0.429 0.571
##
## Node number 1128: 7 observations
## predicted class=N expected loss=0.4285714 P(node) =0.001457119
## class counts: 4 3
## probabilities: 0.571 0.429
##
## Node number 1129: 16 observations
## predicted class=Y expected loss=0.3125 P(node) =0.003330558
## class counts: 5 11
## probabilities: 0.312 0.688
##
## Node number 2114: 71 observations
## predicted class=N expected loss=0.05633803 P(node) =0.01477935
## class counts: 67 4
## probabilities: 0.944 0.056
##
## Node number 2115: 209 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.1483254 P(node) =0.04350541
## class counts: 178 31
## probabilities: 0.852 0.148
## left son=4230 (12 obs) right son=4231 (197 obs)
## Primary splits:
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve=0.5601729, (0 missing)
## NDSSName.my.fctr#Multimedia# < 0.5 to the right, improve=0.5108985, (0 missing)
## WordCount.root2 < 30.09153 to the right, improve=0.4980706, (0 missing)
## NDSSName.my.fctrMetro#N.Y./Region# < 0.5 to the right, improve=0.4241343, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=0.3390226, (0 missing)
##
## Node number 2118: 19 observations
## predicted class=N expected loss=0.2105263 P(node) =0.003955037
## class counts: 15 4
## probabilities: 0.789 0.211
##
## Node number 2119: 9 observations
## predicted class=Y expected loss=0.4444444 P(node) =0.001873439
## class counts: 4 5
## probabilities: 0.444 0.556
##
## Node number 2126: 25 observations
## predicted class=N expected loss=0.28 P(node) =0.005203997
## class counts: 18 7
## probabilities: 0.720 0.280
##
## Node number 2127: 41 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.4878049 P(node) =0.008534555
## class counts: 21 20
## probabilities: 0.512 0.488
## left son=4254 (30 obs) right son=4255 (11 obs)
## Primary splits:
## WordCount.root2 < 36.31791 to the right, improve=0.6635625, (0 missing)
##
## Node number 4230: 12 observations
## predicted class=N expected loss=0 P(node) =0.002497918
## class counts: 12 0
## probabilities: 1.000 0.000
##
## Node number 4231: 197 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.1573604 P(node) =0.04100749
## class counts: 166 31
## probabilities: 0.843 0.157
## left son=8462 (11 obs) right son=8463 (186 obs)
## Primary splits:
## NDSSName.my.fctr#Multimedia# < 0.5 to the right, improve=0.5769882, (0 missing)
## NDSSName.my.fctrMetro#N.Y./Region# < 0.5 to the right, improve=0.5314217, (0 missing)
## WordCount.root2 < 30.09153 to the right, improve=0.4682049, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=0.4106319, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness < 0.5 to the right, improve=0.1814254, (0 missing)
##
## Node number 4254: 30 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.4333333 P(node) =0.006244796
## class counts: 17 13
## probabilities: 0.567 0.433
## left son=8508 (7 obs) right son=8509 (23 obs)
## Primary splits:
## WordCount.root2 < 37.14159 to the left, improve=0.3979296, (0 missing)
##
## Node number 4255: 11 observations
## predicted class=Y expected loss=0.3636364 P(node) =0.002289759
## class counts: 4 7
## probabilities: 0.364 0.636
##
## Node number 8462: 11 observations
## predicted class=N expected loss=0 P(node) =0.002289759
## class counts: 11 0
## probabilities: 1.000 0.000
##
## Node number 8463: 186 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.1666667 P(node) =0.03871774
## class counts: 155 31
## probabilities: 0.833 0.167
## left son=16926 (29 obs) right son=16927 (157 obs)
## Primary splits:
## NDSSName.my.fctrMetro#N.Y./Region# < 0.5 to the right, improve=0.6559045, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=0.4920635, (0 missing)
## WordCount.root2 < 30.09153 to the right, improve=0.3890196, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness < 0.5 to the right, improve=0.2415584, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook < 0.5 to the left, improve=0.0126479, (0 missing)
##
## Node number 8508: 7 observations
## predicted class=N expected loss=0.2857143 P(node) =0.001457119
## class counts: 5 2
## probabilities: 0.714 0.286
##
## Node number 8509: 23 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.4782609 P(node) =0.004787677
## class counts: 12 11
## probabilities: 0.522 0.478
## left son=17018 (8 obs) right son=17019 (15 obs)
## Primary splits:
## WordCount.root2 < 38.57459 to the right, improve=0.2615942, (0 missing)
##
## Node number 16926: 29 observations
## predicted class=N expected loss=0.06896552 P(node) =0.006036636
## class counts: 27 2
## probabilities: 0.931 0.069
##
## Node number 16927: 157 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.1847134 P(node) =0.0326811
## class counts: 128 29
## probabilities: 0.815 0.185
## left son=33854 (18 obs) right son=33855 (139 obs)
## Primary splits:
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=0.67831090, (0 missing)
## WordCount.root2 < 32.38827 to the left, improve=0.61044970, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness < 0.5 to the right, improve=0.38816480, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook < 0.5 to the right, improve=0.01539613, (0 missing)
## Surrogate splits:
## WordCount.root2 < 32.51922 to the right, agree=0.892, adj=0.056, (0 split)
##
## Node number 17018: 8 observations
## predicted class=N expected loss=0.375 P(node) =0.001665279
## class counts: 5 3
## probabilities: 0.625 0.375
##
## Node number 17019: 15 observations
## predicted class=Y expected loss=0.4666667 P(node) =0.003122398
## class counts: 7 8
## probabilities: 0.467 0.533
##
## Node number 33854: 18 observations
## predicted class=N expected loss=0.05555556 P(node) =0.003746878
## class counts: 17 1
## probabilities: 0.944 0.056
##
## Node number 33855: 139 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.2014388 P(node) =0.02893422
## class counts: 111 28
## probabilities: 0.799 0.201
## left son=67710 (102 obs) right son=67711 (37 obs)
## Primary splits:
## WordCount.root2 < 30.09153 to the right, improve=0.9266317, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness < 0.5 to the right, improve=0.5580040, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook < 0.5 to the right, improve=0.1306354, (0 missing)
##
## Node number 67710: 102 observations
## predicted class=N expected loss=0.1666667 P(node) =0.02123231
## class counts: 85 17
## probabilities: 0.833 0.167
##
## Node number 67711: 37 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.2972973 P(node) =0.007701915
## class counts: 26 11
## probabilities: 0.703 0.297
## left son=135422 (30 obs) right son=135423 (7 obs)
## Primary splits:
## WordCount.root2 < 29.92488 to the left, improve=3.00231700, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook < 0.5 to the left, improve=0.01303089, (0 missing)
##
## Node number 135422: 30 observations
## predicted class=N expected loss=0.2 P(node) =0.006244796
## class counts: 24 6
## probabilities: 0.800 0.200
##
## Node number 135423: 7 observations
## predicted class=Y expected loss=0.2857143 P(node) =0.001457119
## class counts: 2 5
## probabilities: 0.286 0.714
##
## n= 4804
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4804 863 N (0.82035803 0.17964197)
## 2) NDSSName.my.fctrOpEd#Opinion#< 0.5 4367 485 N (0.88893978 0.11106022)
## 4) NDSSName.my.fctrBusiness#Crosswords/Games#< 0.5 4262 390 N (0.90849366 0.09150634)
## 8) NDSSName.my.fctrScience#Health#< 0.5 4114 279 N (0.93218279 0.06781721)
## 16) NDSSName.my.fctrStyles#U.S.#< 0.5 3987 191 N (0.95209431 0.04790569)
## 32) WordCount.root2< 25.01 2982 38 N (0.98725687 0.01274313) *
## 33) WordCount.root2>=25.01 1005 153 N (0.84776119 0.15223881)
## 66) NDSSName.my.fctr#Opinion#ThePublicEditor< 0.5 993 142 N (0.85699899 0.14300101)
## 132) NDSSName.my.fctrCulture#Arts#< 0.5 930 122 N (0.86881720 0.13118280)
## 264) WordCount.root2< 33.97057 627 65 N (0.89633174 0.10366826)
## 528) NDSSName.my.fctrBusiness#Technology#< 0.5 561 49 N (0.91265597 0.08734403)
## 1056) WordCount.root2< 29.33428 281 14 N (0.95017794 0.04982206) *
## 1057) WordCount.root2>=29.33428 280 35 N (0.87500000 0.12500000)
## 2114) WordCount.root2>=32.57299 71 4 N (0.94366197 0.05633803) *
## 2115) WordCount.root2< 32.57299 209 31 N (0.85167464 0.14832536)
## 4230) NDSSName.my.fctrTStyle##>=0.5 12 0 N (1.00000000 0.00000000) *
## 4231) NDSSName.my.fctrTStyle##< 0.5 197 31 N (0.84263959 0.15736041)
## 8462) NDSSName.my.fctr#Multimedia#>=0.5 11 0 N (1.00000000 0.00000000) *
## 8463) NDSSName.my.fctr#Multimedia#< 0.5 186 31 N (0.83333333 0.16666667)
## 16926) NDSSName.my.fctrMetro#N.Y./Region#>=0.5 29 2 N (0.93103448 0.06896552) *
## 16927) NDSSName.my.fctrMetro#N.Y./Region#< 0.5 157 29 N (0.81528662 0.18471338)
## 33854) NDSSName.my.fctrForeign#World#AsiaPacific>=0.5 18 1 N (0.94444444 0.05555556) *
## 33855) NDSSName.my.fctrForeign#World#AsiaPacific< 0.5 139 28 N (0.79856115 0.20143885)
## 67710) WordCount.root2>=30.09153 102 17 N (0.83333333 0.16666667) *
## 67711) WordCount.root2< 30.09153 37 11 N (0.70270270 0.29729730)
## 135422) WordCount.root2< 29.92488 30 6 N (0.80000000 0.20000000) *
## 135423) WordCount.root2>=29.92488 7 2 Y (0.28571429 0.71428571) *
## 529) NDSSName.my.fctrBusiness#Technology#>=0.5 66 16 N (0.75757576 0.24242424)
## 1058) WordCount.root2< 27.86575 38 7 N (0.81578947 0.18421053) *
## 1059) WordCount.root2>=27.86575 28 9 N (0.67857143 0.32142857)
## 2118) WordCount.root2>=28.6269 19 4 N (0.78947368 0.21052632) *
## 2119) WordCount.root2< 28.6269 9 4 Y (0.44444444 0.55555556) *
## 265) WordCount.root2>=33.97057 303 57 N (0.81188119 0.18811881)
## 530) NDSSName.my.fctrBusiness#BusinessDay#Dealbook< 0.5 222 29 N (0.86936937 0.13063063) *
## 531) NDSSName.my.fctrBusiness#BusinessDay#Dealbook>=0.5 81 28 N (0.65432099 0.34567901)
## 1062) WordCount.root2>=41.59766 15 1 N (0.93333333 0.06666667) *
## 1063) WordCount.root2< 41.59766 66 27 N (0.59090909 0.40909091)
## 2126) WordCount.root2< 35.6581 25 7 N (0.72000000 0.28000000) *
## 2127) WordCount.root2>=35.6581 41 20 N (0.51219512 0.48780488)
## 4254) WordCount.root2>=36.31791 30 13 N (0.56666667 0.43333333)
## 8508) WordCount.root2< 37.14159 7 2 N (0.71428571 0.28571429) *
## 8509) WordCount.root2>=37.14159 23 11 N (0.52173913 0.47826087)
## 17018) WordCount.root2>=38.57459 8 3 N (0.62500000 0.37500000) *
## 17019) WordCount.root2< 38.57459 15 7 Y (0.46666667 0.53333333) *
## 4255) WordCount.root2< 36.31791 11 4 Y (0.36363636 0.63636364) *
## 133) NDSSName.my.fctrCulture#Arts#>=0.5 63 20 N (0.68253968 0.31746032)
## 266) WordCount.root2< 26.99984 14 3 N (0.78571429 0.21428571) *
## 267) WordCount.root2>=26.99984 49 17 N (0.65306122 0.34693878)
## 534) WordCount.root2>=41.56249 10 2 N (0.80000000 0.20000000) *
## 535) WordCount.root2< 41.56249 39 15 N (0.61538462 0.38461538)
## 1070) WordCount.root2< 34.23387 32 11 N (0.65625000 0.34375000) *
## 1071) WordCount.root2>=34.23387 7 3 Y (0.42857143 0.57142857) *
## 67) NDSSName.my.fctr#Opinion#ThePublicEditor>=0.5 12 1 Y (0.08333333 0.91666667) *
## 17) NDSSName.my.fctrStyles#U.S.#>=0.5 127 39 Y (0.30708661 0.69291339)
## 34) WordCount.root2< 15.32846 13 5 N (0.61538462 0.38461538) *
## 35) WordCount.root2>=15.32846 114 31 Y (0.27192982 0.72807018)
## 70) WordCount.root2< 29.21444 79 25 Y (0.31645570 0.68354430)
## 140) WordCount.root2>=27.36786 25 10 Y (0.40000000 0.60000000)
## 280) WordCount.root2< 28.02674 8 3 N (0.62500000 0.37500000) *
## 281) WordCount.root2>=28.02674 17 5 Y (0.29411765 0.70588235) *
## 141) WordCount.root2< 27.36786 54 15 Y (0.27777778 0.72222222)
## 282) WordCount.root2< 26.55173 45 14 Y (0.31111111 0.68888889)
## 564) WordCount.root2>=21.70252 23 9 Y (0.39130435 0.60869565)
## 1128) WordCount.root2< 23.6326 7 3 N (0.57142857 0.42857143) *
## 1129) WordCount.root2>=23.6326 16 5 Y (0.31250000 0.68750000) *
## 565) WordCount.root2< 21.70252 22 5 Y (0.22727273 0.77272727) *
## 283) WordCount.root2>=26.55173 9 1 Y (0.11111111 0.88888889) *
## 71) WordCount.root2>=29.21444 35 6 Y (0.17142857 0.82857143) *
## 9) NDSSName.my.fctrScience#Health#>=0.5 148 37 Y (0.25000000 0.75000000)
## 18) WordCount.root2< 22.72663 55 23 N (0.58181818 0.41818182)
## 36) WordCount.root2>=19.93708 9 2 N (0.77777778 0.22222222) *
## 37) WordCount.root2< 19.93708 46 21 N (0.54347826 0.45652174)
## 74) WordCount.root2< 17.01454 36 14 N (0.61111111 0.38888889)
## 148) WordCount.root2>=15.74773 8 2 N (0.75000000 0.25000000) *
## 149) WordCount.root2< 15.74773 28 12 N (0.57142857 0.42857143)
## 298) WordCount.root2< 15.06648 20 7 N (0.65000000 0.35000000) *
## 299) WordCount.root2>=15.06648 8 3 Y (0.37500000 0.62500000) *
## 75) WordCount.root2>=17.01454 10 3 Y (0.30000000 0.70000000) *
## 19) WordCount.root2>=22.72663 93 5 Y (0.05376344 0.94623656) *
## 5) NDSSName.my.fctrBusiness#Crosswords/Games#>=0.5 105 10 Y (0.09523810 0.90476190)
## 10) WordCount.root2< 18.9043 12 5 N (0.58333333 0.41666667) *
## 11) WordCount.root2>=18.9043 93 3 Y (0.03225806 0.96774194) *
## 3) NDSSName.my.fctrOpEd#Opinion#>=0.5 437 59 Y (0.13501144 0.86498856) *
## Prediction
## Reference N Y
## N 3814 127
## Y 170 693
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.381765e-01 7.860827e-01 9.309917e-01 9.448229e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 2.798570e-127 1.480611e-02
## Prediction
## Reference N Y
## N 1180 318
## Y 84 146
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.673611e-01 2.953321e-01 7.467059e-01 7.871043e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 3.224022e-31
## id feats
## 1 Max.cor.Y.rcv.1X1.cp.0###rpart WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 0 0.969 0.076
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8821543 0.9705658 0.7937428 0.9504198
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.8235294 0.9381765
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9309917 0.9448229 0.7860827
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.6174697 0.9218959 0.3130435 0.7773858
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4207493 0.7673611
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7467059 0.7871043 0.2953321
#stop("here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)
# if (glb_is_regression || glb_is_binomial) # For multinomials this model will be run next by default
# Fit the "Max.cor.Y" rpart model on the max-correlation features using
# repeated cross-validation (folds/repeats taken from glb_rcv_n_folds and
# glb_rcv_n_repeats). trainControl.allowParallel is explicitly FALSE here.
ret_lst <- myfit_mdl(
    mdl_specs_lst = myinit_mdl_specs_lst(
        mdl_specs_lst = list(
            id.prefix                    = "Max.cor.Y",
            type                         = glb_model_type,
            trainControl.method          = "repeatedcv",
            trainControl.number          = glb_rcv_n_folds,
            trainControl.repeats         = glb_rcv_n_repeats,
            trainControl.classProbs      = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            trainControl.allowParallel   = FALSE,
            train.metric                 = glbMdlMetricSummary,
            train.maximize               = glbMdlMetricMaximize,
            train.method                 = "rpart"
        )
    ),
    indep_vars = max_cor_y_x_vars,
    rsp_var    = glb_rsp_var,
    fit_df     = glbObsFit,
    OOB_df     = glbObsOOB
)
## [1] "fitting model: Max.cor.Y##rcv#rpart"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## + Fold1.Rep1: cp=0.01043
## - Fold1.Rep1: cp=0.01043
## + Fold2.Rep1: cp=0.01043
## - Fold2.Rep1: cp=0.01043
## + Fold3.Rep1: cp=0.01043
## - Fold3.Rep1: cp=0.01043
## + Fold1.Rep2: cp=0.01043
## - Fold1.Rep2: cp=0.01043
## + Fold2.Rep2: cp=0.01043
## - Fold2.Rep2: cp=0.01043
## + Fold3.Rep2: cp=0.01043
## - Fold3.Rep2: cp=0.01043
## + Fold1.Rep3: cp=0.01043
## - Fold1.Rep3: cp=0.01043
## + Fold2.Rep3: cp=0.01043
## - Fold2.Rep3: cp=0.01043
## + Fold3.Rep3: cp=0.01043
## - Fold3.Rep3: cp=0.01043
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0104 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y", : model's bestTune found at an extreme of
## tuneGrid for parameter: cp
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 4804
##
## CP nsplit rel error
## 1 0.36964079 0 1.0000000
## 2 0.09849363 1 0.6303592
## 3 0.08574739 2 0.5318656
## 4 0.05677868 3 0.4461182
## 5 0.01042874 4 0.3893395
##
## Variable importance
## NDSSName.my.fctrOpEd#Opinion#
## 55
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 16
## NDSSName.my.fctrScience#Health#
## 16
## NDSSName.my.fctrStyles#U.S.#
## 12
##
## Node number 1: 4804 observations, complexity param=0.3696408
## predicted class=N expected loss=0.179642 P(node) =1
## class counts: 3941 863
## probabilities: 0.820 0.180
## left son=2 (4367 obs) right son=3 (437 obs)
## Primary splits:
## NDSSName.my.fctrOpEd#Opinion# < 0.5 to the left, improve=451.59770, (0 missing)
## NDSSName.my.fctrBusiness#Crosswords/Games# < 0.5 to the left, improve=112.88510, (0 missing)
## WordCount.root2 < 25.75849 to the left, improve=111.17610, (0 missing)
## NDSSName.my.fctrScience#Health# < 0.5 to the left, improve= 99.35206, (0 missing)
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve= 68.73272, (0 missing)
##
## Node number 2: 4367 observations, complexity param=0.09849363
## predicted class=N expected loss=0.1110602 P(node) =0.9090341
## class counts: 3882 485
## probabilities: 0.889 0.111
## left son=4 (4262 obs) right son=5 (105 obs)
## Primary splits:
## NDSSName.my.fctrBusiness#Crosswords/Games# < 0.5 to the left, improve=135.55130, (0 missing)
## NDSSName.my.fctrScience#Health# < 0.5 to the left, improve=125.07920, (0 missing)
## WordCount.root2 < 25.75849 to the left, improve= 94.70710, (0 missing)
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve= 88.56821, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve= 18.74400, (0 missing)
##
## Node number 3: 437 observations
## predicted class=Y expected loss=0.1350114 P(node) =0.09096586
## class counts: 59 378
## probabilities: 0.135 0.865
##
## Node number 4: 4262 observations, complexity param=0.08574739
## predicted class=N expected loss=0.09150634 P(node) =0.8871774
## class counts: 3872 390
## probabilities: 0.908 0.092
## left son=8 (4114 obs) right son=9 (148 obs)
## Primary splits:
## NDSSName.my.fctrScience#Health# < 0.5 to the left, improve=132.96710, (0 missing)
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve= 94.69099, (0 missing)
## WordCount.root2 < 26.49528 to the left, improve= 84.07487, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve= 19.71762, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve= 10.17000, (0 missing)
##
## Node number 5: 105 observations
## predicted class=Y expected loss=0.0952381 P(node) =0.02185679
## class counts: 10 95
## probabilities: 0.095 0.905
##
## Node number 8: 4114 observations, complexity param=0.05677868
## predicted class=N expected loss=0.06781721 P(node) =0.8563697
## class counts: 3835 279
## probabilities: 0.932 0.068
## left son=16 (3987 obs) right son=17 (127 obs)
## Primary splits:
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve=102.410700, (0 missing)
## WordCount.root2 < 25.01 to the left, improve= 47.352210, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve= 20.930810, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve= 5.249425, (0 missing)
## NDSSName.my.fctrBusiness#Technology# < 0.5 to the left, improve= 2.395935, (0 missing)
##
## Node number 9: 148 observations
## predicted class=Y expected loss=0.25 P(node) =0.03080766
## class counts: 37 111
## probabilities: 0.250 0.750
##
## Node number 16: 3987 observations
## predicted class=N expected loss=0.04790569 P(node) =0.8299334
## class counts: 3796 191
## probabilities: 0.952 0.048
##
## Node number 17: 127 observations
## predicted class=Y expected loss=0.3070866 P(node) =0.0264363
## class counts: 39 88
## probabilities: 0.307 0.693
##
## n= 4804
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4804 863 N (0.82035803 0.17964197)
## 2) NDSSName.my.fctrOpEd#Opinion#< 0.5 4367 485 N (0.88893978 0.11106022)
## 4) NDSSName.my.fctrBusiness#Crosswords/Games#< 0.5 4262 390 N (0.90849366 0.09150634)
## 8) NDSSName.my.fctrScience#Health#< 0.5 4114 279 N (0.93218279 0.06781721)
## 16) NDSSName.my.fctrStyles#U.S.#< 0.5 3987 191 N (0.95209431 0.04790569) *
## 17) NDSSName.my.fctrStyles#U.S.#>=0.5 127 39 Y (0.30708661 0.69291339) *
## 9) NDSSName.my.fctrScience#Health#>=0.5 148 37 Y (0.25000000 0.75000000) *
## 5) NDSSName.my.fctrBusiness#Crosswords/Games#>=0.5 105 10 Y (0.09523810 0.90476190) *
## 3) NDSSName.my.fctrOpEd#Opinion#>=0.5 437 59 Y (0.13501144 0.86498856) *
## Prediction
## Reference N Y
## N 3796 145
## Y 191 672
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.300583e-01 7.576571e-01 9.224771e-01 9.371115e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 4.458834e-108 1.409037e-02
## Prediction
## Reference N Y
## N 1355 143
## Y 168 62
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.8200231 0.1825002 0.8010821 0.8378705 0.8668981
## AccuracyPValue McnemarPValue
## 1.0000000 0.1735405
## id feats max.nTuningRuns
## 1 Max.cor.Y##rcv#rpart WordCount.root2,NDSSName.my.fctr 5
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 3.421 0.074 0.8709432
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 0.9632073 0.778679 0.8746354 0.6
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.8 0.9296422 0.9224771
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.9371115 0.7515134 0.5870523 0.9045394
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0.2695652 0.5892132 0.6 0.2850575
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.8200231 0.8010821 0.8378705
## max.Kappa.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.1825002 0.00506952 0.0191091
# Fit the "Max.cor.Y.Time.Poly" glmnet model: the max-correlation features plus
# every "<date-time feature>.day.minutes.poly.*" column found in glbObsAll.
# The fit runs only when date-time features are configured and at least one
# such polynomial column exists.
#
# The per-feature patterns are collapsed into a single alternation because
# grep()/grepl() use only the first element of a multi-element pattern vector
# (with a warning); without the collapse, columns belonging to the 2nd, 3rd, ...
# date-time feature would be silently ignored.
timePolyPattern <- paste0(names(glbFeatsDateTime),
                          "\\.day\\.minutes\\.poly\\.", collapse = "|")
if ((length(glbFeatsDateTime) > 0) &&
    any(grepl(timePolyPattern, names(glbObsAll)))) {
    fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
        paste0("fit.models_0_", "Max.cor.Y.Time.Poly"), major.inc = FALSE,
        label.minor = "glmnet")

    # Base features plus all matching time-polynomial columns
    indepVars <- c(max_cor_y_x_vars,
                   grep(timePolyPattern, names(glbObsAll), value = TRUE))
    indepVars <- myadjust_interaction_feats(indepVars)
    ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = "Max.cor.Y.Time.Poly",
            type = glb_model_type, trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds, trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = "glmnet")),
        indep_vars = indepVars,
        rsp_var = glb_rsp_var,
        fit_df = glbObsFit, OOB_df = glbObsOOB)
}
## label step_major step_minor label_minor
## 5 fit.models_0_Max.cor.Y[rcv.1X1.cp.0|] 1 4 rpart
## 6 fit.models_0_Max.cor.Y.Time.Poly 1 5 glmnet
## bgn end elapsed
## 5 150.482 165.178 14.696
## 6 165.179 NA NA
## [1] "fitting model: Max.cor.Y.Time.Poly##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.day.minutes.poly.2,PubDate.day.minutes.poly.3,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.0201 on full training set
## Length Class Mode
## a0 100 -none- numeric
## beta 13100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 131 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.80640970
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.32799873
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 2.71895558
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.69928283
## NDSSName.my.fctrOpEd#Opinion#
## 4.00043819
## NDSSName.my.fctrScience#Health#
## 3.10750415
## NDSSName.my.fctrStyles#U.S.#
## 2.85717557
## NDSSName.my.fctrTStyle##
## -0.14606302
## PubDate.day.minutes.poly.1
## 5.57604425
## WordCount.root2
## 0.05218571
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 0.75372185
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.88967393
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.48865614
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 2.84279682
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.76936872
## NDSSName.my.fctrBusiness#Technology#
## 0.09968733
## NDSSName.my.fctrOpEd#Opinion#
## 4.06768787
## NDSSName.my.fctrScience#Health#
## 3.17511582
## NDSSName.my.fctrStyles#U.S.#
## 2.92206821
## NDSSName.my.fctrTStyle##
## -0.20202194
## PubDate.day.minutes.poly.1
## 7.01701251
## WordCount.root2
## 0.05458241
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 3.21496869
## Prediction
## Reference N Y
## N 3797 144
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.331807e-01 7.698585e-01 9.257484e-01 9.400811e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 3.361187e-115 7.408860e-02
## Prediction
## Reference N Y
## N 1200 298
## Y 90 140
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.754630e-01 2.963401e-01 7.550404e-01 7.949457e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 7.866103e-26
## id
## 1 Max.cor.Y.Time.Poly##rcv#glmnet
## feats
## 1 WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.day.minutes.poly.2,PubDate.day.minutes.poly.3,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 12.606 1.702
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8734975 0.9659985 0.7809965 0.9534659
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.8103957 0.931932
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9257484 0.9400811 0.762829
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5950717 0.9118825 0.2782609 0.7997373
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4191617 0.775463
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7550404 0.7949457 0.2963401
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.005321823 0.01901796
# Fit the "Max.cor.Y.Time.Lag" glmnet model: the max-correlation features plus
# every "<date-time feature>.last<digit>..." column found in glbObsAll, tuned
# over glmnet_tune_models_df. The fit runs only when date-time features are
# configured and at least one such lag column exists.
#
# The per-feature patterns are collapsed into a single alternation because
# grep()/grepl() use only the first element of a multi-element pattern vector
# (with a warning); without the collapse, columns belonging to the 2nd, 3rd, ...
# date-time feature would be silently ignored.
timeLagPattern <- paste0(names(glbFeatsDateTime),
                         "\\.last[[:digit:]]", collapse = "|")
if ((length(glbFeatsDateTime) > 0) &&
    any(grepl(timeLagPattern, names(glbObsAll)))) {
    fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
        paste0("fit.models_0_", "Max.cor.Y.Time.Lag"), major.inc = FALSE,
        label.minor = "glmnet")

    # Base features plus all matching time-lag columns
    indepVars <- c(max_cor_y_x_vars,
                   grep(timeLagPattern, names(glbObsAll), value = TRUE))
    indepVars <- myadjust_interaction_feats(indepVars)
    ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = "Max.cor.Y.Time.Lag",
            type = glb_model_type,
            tune.df = glmnet_tune_models_df,
            trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds, trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = "glmnet")),
        indep_vars = indepVars,
        rsp_var = glb_rsp_var,
        fit_df = glbObsFit, OOB_df = glbObsOOB)
}
## label step_major step_minor label_minor
## 6 fit.models_0_Max.cor.Y.Time.Poly 1 5 glmnet
## 7 fit.models_0_Max.cor.Y.Time.Lag 1 6 glmnet
## bgn end elapsed
## 6 165.179 184.992 19.813
## 7 184.992 NA NA
## [1] "fitting model: Max.cor.Y.Time.Lag##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr,PubDate.last2.log1p,PubDate.last4.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.last32.log1p,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0934 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y.Time.Lag", : model's bestTune found at an
## extreme of tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y.Time.Lag", : model's bestTune found at an
## extreme of tuneGrid for parameter: lambda
## Length Class Mode
## a0 100 -none- numeric
## beta 13100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 131 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.3346728432
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.0700552892
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 0.6960494506
## NDSSName.my.fctr#U.S.#Education
## -0.0690824805
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 0.6932362214
## NDSSName.my.fctrBusiness#Technology#
## 0.0213652189
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.0400198990
## NDSSName.my.fctrOpEd#Opinion#
## 0.7869504896
## NDSSName.my.fctrScience#Health#
## 0.8481886152
## NDSSName.my.fctrStyles##Fashion
## -0.0328362133
## NDSSName.my.fctrStyles#U.S.#
## 0.6561242230
## NDSSName.my.fctrTStyle##
## -0.0981051454
## PubDate.last2.log1p
## 0.0092548859
## PubDate.last4.log1p
## 0.0102871187
## PubDate.last8.log1p
## 0.0024984935
## WordCount.root2
## 0.0301342902
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last16.log1p.ctg
## -0.0035122093
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last16.log1p.ctg
## 0.0433276050
## NDSSName.my.fctr#U.S.#Education:PubDate.last16.log1p.ctg
## -0.0041927390
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last16.log1p.ctg
## 0.0484243489
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last16.log1p.ctg
## -0.0028368765
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last16.log1p.ctg
## 0.0461504248
## NDSSName.my.fctrScience#Health#:PubDate.last16.log1p.ctg
## 0.0309830777
## NDSSName.my.fctrStyles##Fashion:PubDate.last16.log1p.ctg
## -0.0006575310
## NDSSName.my.fctrStyles#U.S.#:PubDate.last16.log1p.ctg
## 0.0239742568
## NDSSName.my.fctrTStyle##:PubDate.last16.log1p.ctg
## -0.0065933198
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last2.log1p.ctg
## -0.0056903861
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last2.log1p.ctg
## 0.0314132550
## NDSSName.my.fctr#U.S.#Education:PubDate.last2.log1p.ctg
## -0.0062595339
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last2.log1p.ctg
## 0.0683177942
## NDSSName.my.fctrBusiness#Technology#:PubDate.last2.log1p.ctg
## 0.0089001176
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last2.log1p.ctg
## -0.0039023477
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last2.log1p.ctg
## 0.0677459270
## NDSSName.my.fctrScience#Health#:PubDate.last2.log1p.ctg
## 0.0530838737
## NDSSName.my.fctrStyles##Fashion:PubDate.last2.log1p.ctg
## -0.0015560741
## NDSSName.my.fctrStyles#U.S.#:PubDate.last2.log1p.ctg
## 0.0595010497
## NDSSName.my.fctrTStyle##:PubDate.last2.log1p.ctg
## -0.0094780945
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last32.log1p.ctg
## -0.0036496827
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last32.log1p.ctg
## 0.0438906916
## NDSSName.my.fctr#U.S.#Education:PubDate.last32.log1p.ctg
## -0.0028934662
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last32.log1p.ctg
## 0.0420188172
## NDSSName.my.fctrBusiness#Technology#:PubDate.last32.log1p.ctg
## 0.0001053778
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last32.log1p.ctg
## -0.0002655336
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last32.log1p.ctg
## 0.0399526463
## NDSSName.my.fctrScience#Health#:PubDate.last32.log1p.ctg
## 0.0309010857
## NDSSName.my.fctrStyles#U.S.#:PubDate.last32.log1p.ctg
## 0.0211726295
## NDSSName.my.fctrTStyle##:PubDate.last32.log1p.ctg
## -0.0055085967
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last4.log1p.ctg
## -0.0054198705
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last4.log1p.ctg
## 0.0332806264
## NDSSName.my.fctr#U.S.#Education:PubDate.last4.log1p.ctg
## -0.0058751936
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last4.log1p.ctg
## 0.0637336391
## NDSSName.my.fctrBusiness#Technology#:PubDate.last4.log1p.ctg
## 0.0052011593
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last4.log1p.ctg
## -0.0038587969
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last4.log1p.ctg
## 0.0596874074
## NDSSName.my.fctrScience#Health#:PubDate.last4.log1p.ctg
## 0.0396819084
## NDSSName.my.fctrStyles##Fashion:PubDate.last4.log1p.ctg
## -0.0013725654
## NDSSName.my.fctrStyles#U.S.#:PubDate.last4.log1p.ctg
## 0.0532860460
## NDSSName.my.fctrTStyle##:PubDate.last4.log1p.ctg
## -0.0082041791
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last8.log1p.ctg
## -0.0051487144
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last8.log1p.ctg
## 0.0400352897
## NDSSName.my.fctr#U.S.#Education:PubDate.last8.log1p.ctg
## -0.0050160685
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last8.log1p.ctg
## 0.0486360490
## NDSSName.my.fctrBusiness#Technology#:PubDate.last8.log1p.ctg
## 0.0032553238
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last8.log1p.ctg
## -0.0037797318
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last8.log1p.ctg
## 0.0530103663
## NDSSName.my.fctrScience#Health#:PubDate.last8.log1p.ctg
## 0.0378007768
## NDSSName.my.fctrStyles##Fashion:PubDate.last8.log1p.ctg
## -0.0013683257
## NDSSName.my.fctrStyles#U.S.#:PubDate.last8.log1p.ctg
## 0.0359367759
## NDSSName.my.fctrTStyle##:PubDate.last8.log1p.ctg
## -0.0076052662
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.4741443935
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.0913108705
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 0.7302582708
## NDSSName.my.fctr#U.S.#Education
## -0.0799432875
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 0.7098915737
## NDSSName.my.fctrBusiness#Technology#
## 0.0313863839
## NDSSName.my.fctrForeign#World#
## -0.0169095302
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.0511873031
## NDSSName.my.fctrOpEd#Opinion#
## 0.8108543356
## NDSSName.my.fctrScience#Health#
## 0.8885013044
## NDSSName.my.fctrStyles##Fashion
## -0.0470453548
## NDSSName.my.fctrStyles#U.S.#
## 0.6815476676
## NDSSName.my.fctrTStyle##
## -0.1056040103
## PubDate.last2.log1p
## 0.0119991350
## PubDate.last4.log1p
## 0.0133329085
## PubDate.last8.log1p
## 0.0064217053
## WordCount.root2
## 0.0320610100
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last16.log1p.ctg
## -0.0050988188
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last16.log1p.ctg
## 0.0448997743
## NDSSName.my.fctr#U.S.#Education:PubDate.last16.log1p.ctg
## -0.0050084105
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last16.log1p.ctg
## 0.0491525599
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last16.log1p.ctg
## -0.0037475441
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last16.log1p.ctg
## 0.0465297477
## NDSSName.my.fctrScience#Health#:PubDate.last16.log1p.ctg
## 0.0307757995
## NDSSName.my.fctrStyles##Fashion:PubDate.last16.log1p.ctg
## -0.0017151206
## NDSSName.my.fctrStyles#U.S.#:PubDate.last16.log1p.ctg
## 0.0234485995
## NDSSName.my.fctrTStyle##:PubDate.last16.log1p.ctg
## -0.0071212577
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last2.log1p.ctg
## -0.0077599233
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last2.log1p.ctg
## 0.0315315004
## NDSSName.my.fctr#U.S.#Education:PubDate.last2.log1p.ctg
## -0.0073500972
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last2.log1p.ctg
## 0.0699029286
## NDSSName.my.fctrBusiness#Technology#:PubDate.last2.log1p.ctg
## 0.0104997991
## NDSSName.my.fctrForeign#World#:PubDate.last2.log1p.ctg
## -0.0002079675
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last2.log1p.ctg
## -0.0050496337
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last2.log1p.ctg
## 0.0690992956
## NDSSName.my.fctrScience#Health#:PubDate.last2.log1p.ctg
## 0.0541552574
## NDSSName.my.fctrStyles##Fashion:PubDate.last2.log1p.ctg
## -0.0029060086
## NDSSName.my.fctrStyles#U.S.#:PubDate.last2.log1p.ctg
## 0.0614899139
## NDSSName.my.fctrTStyle##:PubDate.last2.log1p.ctg
## -0.0102372000
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last32.log1p.ctg
## -0.0051820958
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last32.log1p.ctg
## 0.0456360744
## NDSSName.my.fctr#U.S.#Education:PubDate.last32.log1p.ctg
## -0.0036536131
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last32.log1p.ctg
## 0.0424580432
## NDSSName.my.fctrBusiness#Technology#:PubDate.last32.log1p.ctg
## 0.0007660277
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last32.log1p.ctg
## -0.0010335475
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last32.log1p.ctg
## 0.0400154206
## NDSSName.my.fctrScience#Health#:PubDate.last32.log1p.ctg
## 0.0309238460
## NDSSName.my.fctrStyles##Fashion:PubDate.last32.log1p.ctg
## -0.0002014750
## NDSSName.my.fctrStyles#U.S.#:PubDate.last32.log1p.ctg
## 0.0205882818
## NDSSName.my.fctrTStyle##:PubDate.last32.log1p.ctg
## -0.0059487936
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last4.log1p.ctg
## -0.0073308225
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last4.log1p.ctg
## 0.0336573160
## NDSSName.my.fctr#U.S.#Education:PubDate.last4.log1p.ctg
## -0.0068682427
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last4.log1p.ctg
## 0.0652303875
## NDSSName.my.fctrBusiness#Technology#:PubDate.last4.log1p.ctg
## 0.0064111433
## NDSSName.my.fctrForeign#World#:PubDate.last4.log1p.ctg
## -0.0009097510
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last4.log1p.ctg
## -0.0049209363
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last4.log1p.ctg
## 0.0606305304
## NDSSName.my.fctrScience#Health#:PubDate.last4.log1p.ctg
## 0.0397307372
## NDSSName.my.fctrStyles##Fashion:PubDate.last4.log1p.ctg
## -0.0025762211
## NDSSName.my.fctrStyles#U.S.#:PubDate.last4.log1p.ctg
## 0.0548717296
## NDSSName.my.fctrTStyle##:PubDate.last4.log1p.ctg
## -0.0088553088
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last8.log1p.ctg
## -0.0069262052
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last8.log1p.ctg
## 0.0411687878
## NDSSName.my.fctr#U.S.#Education:PubDate.last8.log1p.ctg
## -0.0058924220
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last8.log1p.ctg
## 0.0491118878
## NDSSName.my.fctrBusiness#Technology#:PubDate.last8.log1p.ctg
## 0.0042430248
## NDSSName.my.fctrForeign#World#:PubDate.last8.log1p.ctg
## -0.0006436969
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last8.log1p.ctg
## -0.0047925039
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last8.log1p.ctg
## 0.0537292903
## NDSSName.my.fctrScience#Health#:PubDate.last8.log1p.ctg
## 0.0379358610
## NDSSName.my.fctrStyles##Fashion:PubDate.last8.log1p.ctg
## -0.0024919231
## NDSSName.my.fctrStyles#U.S.#:PubDate.last8.log1p.ctg
## 0.0362319863
## NDSSName.my.fctrTStyle##:PubDate.last8.log1p.ctg
## -0.0082143001
## Prediction
## Reference N Y
## N 3791 150
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.319317e-01 7.662012e-01 9.244394e-01 9.388938e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 2.593356e-112 1.504899e-01
## Prediction
## Reference N Y
## N 1219 279
## Y 98 132
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.818287e-01 2.908072e-01 7.615963e-01 8.010994e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.853246e-20
## id
## 1 Max.cor.Y.Time.Lag##rcv#glmnet
## feats
## 1 WordCount.root2,NDSSName.my.fctr,PubDate.last2.log1p,PubDate.last4.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.last32.log1p,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 5 39.6 2.782
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8621088 0.9652372 0.7589803 0.9558908
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.8075338 0.9279769
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9244394 0.9388938 0.7473218
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5927265 0.9158879 0.2695652 0.8024758
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4118565 0.7818287
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7615963 0.8010994 0.2908072
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.004678817 0.01649089
# Interact.High.cor.Y
# Fit a glmnet model on the max-correlation features plus interactions of the
# first max-correlation feature with each feature listed in
# glb_feats_df$cor.high.X (NAs and near-zero-variance ids removed).
if (length(int_feats <- setdiff(setdiff(unique(glb_feats_df$cor.high.X), NA),
                                subset(glb_feats_df, nzv)$id)) > 0) {
    fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
        paste0("fit.models_0_", "Interact.High.cor.Y"), major.inc = FALSE,
        label.minor = "glmnet")

    # int_feats may contain max_cor_y_x_vars[1] itself, which used to produce a
    # redundant self-interaction term such as "x:x" in the feature list.
    # Drop it here; int_feats itself is left untouched for any later use.
    interact_with <- setdiff(int_feats, max_cor_y_x_vars[1])
    # rep() keeps paste() returning character(0) (rather than "x:") when no
    # interaction partner remains.
    interact_terms <- paste(rep(max_cor_y_x_vars[1], length(interact_with)),
                            interact_with, sep = ":")

    ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = "Interact.High.cor.Y",
            type = glb_model_type, trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds,
            trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = "glmnet")),
        indep_vars = c(max_cor_y_x_vars, interact_terms),
        rsp_var = glb_rsp_var,
        fit_df = glbObsFit, OOB_df = glbObsOOB)
}
## label step_major step_minor label_minor
## 7 fit.models_0_Max.cor.Y.Time.Lag 1 6 glmnet
## 8 fit.models_0_Interact.High.cor.Y 1 7 glmnet
## bgn end elapsed
## 7 184.992 231.726 46.734
## 8 231.726 NA NA
## [1] "fitting model: Interact.High.cor.Y##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr,WordCount.root2:WordCount.root2,WordCount.root2:PubDate.day.minutes.poly.1,WordCount.root2:PubDate.last4.log1p,WordCount.root2:PubDate.month.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.000934 on full training set
## Length Class Mode
## a0 92 -none- numeric
## beta 2392 dgCMatrix S4
## df 92 -none- numeric
## dim 2 -none- numeric
## lambda 92 -none- numeric
## dev.ratio 92 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 26 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -4.8684452161
## NDSSName.my.fctr#Multimedia#
## -1.0487925341
## NDSSName.my.fctr#Opinion#RoomForDebate
## -5.4745572239
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 4.2885428864
## NDSSName.my.fctr#U.S.#Education
## -2.8325574053
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.3151450444
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.7572259115
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 4.3391439837
## NDSSName.my.fctrBusiness#Technology#
## 0.8045020529
## NDSSName.my.fctrCulture#Arts#
## -0.3229201625
## NDSSName.my.fctrForeign#World#
## -1.8392467451
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.8077498032
## NDSSName.my.fctrMetro#N.Y./Region#
## 0.2627263003
## NDSSName.my.fctrOpEd#Opinion#
## 4.7415088342
## NDSSName.my.fctrScience#Health#
## 3.7483695286
## NDSSName.my.fctrStyles##Fashion
## -2.4194040890
## NDSSName.my.fctrStyles#U.S.#
## 3.4229733393
## NDSSName.my.fctrTStyle##
## -2.0211781889
## NDSSName.my.fctrTravel#Travel#
## -1.7751305343
## NDSSName.my.fctrmyOther
## -2.3074809466
## WordCount.root2
## 0.0361357822
## WordCount.root2:PubDate.day.minutes.poly.1
## 1.0813556024
## WordCount.root2:PubDate.last4.log1p
## 0.0069832077
## WordCount.root2:PubDate.month.fctr10
## 0.0039570640
## WordCount.root2:PubDate.month.fctr11
## -0.0001256918
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -4.8709915569
## NDSSName.my.fctr#Multimedia#
## -1.0849702043
## NDSSName.my.fctr#Opinion#RoomForDebate
## -5.5929746399
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 4.2928665989
## NDSSName.my.fctr#U.S.#Education
## -2.9379189459
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.3276698231
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.7772032984
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 4.3366629069
## NDSSName.my.fctrBusiness#Technology#
## 0.8004536454
## NDSSName.my.fctrCulture#Arts#
## -0.3372244058
## NDSSName.my.fctrForeign#World#
## -1.9323067428
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.8410299268
## NDSSName.my.fctrMetro#N.Y./Region#
## 0.2641549470
## NDSSName.my.fctrOpEd#Opinion#
## 4.7402435031
## NDSSName.my.fctrScience#Health#
## 3.7456718341
## NDSSName.my.fctrStyles##Fashion
## -2.5160313263
## NDSSName.my.fctrStyles#U.S.#
## 3.4196115772
## NDSSName.my.fctrTStyle##
## -2.0543357159
## NDSSName.my.fctrTravel#Travel#
## -1.8687591112
## NDSSName.my.fctrmyOther
## -2.4107173389
## WordCount.root2
## 0.0361773869
## WordCount.root2:PubDate.day.minutes.poly.1
## 1.0887365081
## WordCount.root2:PubDate.last4.log1p
## 0.0070269546
## WordCount.root2:PubDate.month.fctr10
## 0.0039706413
## WordCount.root2:PubDate.month.fctr11
## -0.0002339144
## Prediction
## Reference N Y
## N 3787 154
## Y 173 690
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.319317e-01 7.670549e-01 9.244394e-01 9.388938e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 2.593356e-112 3.195407e-01
## Prediction
## Reference N Y
## N 1164 334
## Y 71 159
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.656250e-01 3.156027e-01 7.449213e-01 7.854227e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 9.555643e-39
## id
## 1 Interact.High.cor.Y##rcv#glmnet
## feats
## 1 WordCount.root2,NDSSName.my.fctr,WordCount.root2:WordCount.root2,WordCount.root2:PubDate.day.minutes.poly.1,WordCount.root2:PubDate.last4.log1p,WordCount.root2:PubDate.month.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 5.072 0.322
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8776419 0.9626998 0.792584 0.9625372
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.8084359 0.931585
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9244394 0.9388938 0.764104
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.6009259 0.9105474 0.2913043 0.8140971
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.439834 0.765625
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7449213 0.7854227 0.3156027
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.005250654 0.01810996
# Low.cor.X
# Append a minor (non-major) step labelled "glmnet" for the Low.cor.X model
# to the fit.models_0 chunk-tracking data frame.
fit.models_0_chunk_df <-
    myadd_chunk(fit.models_0_chunk_df,
                paste0("fit.models_0_", "Low.cor.X"),
                major.inc = FALSE,
                label.minor = "glmnet")
## label step_major step_minor label_minor
## 8 fit.models_0_Interact.High.cor.Y 1 7 glmnet
## 9 fit.models_0_Low.cor.X 1 8 glmnet
## bgn end elapsed
## 8 231.726 243.47 11.744
## 9 243.470 NA NA
# Candidate features for the Low.cor.X model: rows of glb_feats_df that have no
# cor.high.X entry, are not flagged nzv, and are not excluded as features.
indep_vars <- subset(
    glb_feats_df,
    is.na(cor.high.X) & !nzv & (exclude.as.feat != 1)
)[["id"]]
indep_vars <- myadjust_interaction_feats(indep_vars)

# Fit glmnet with repeated cross-validation over the glmnet tuning grid,
# using the fit observations for training and the OOB observations for
# out-of-bag evaluation.
ret_lst <- myfit_mdl(
    mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
        id.prefix = "Low.cor.X",
        type = glb_model_type,
        tune.df = glmnet_tune_models_df,
        trainControl.method = "repeatedcv",
        trainControl.number = glb_rcv_n_folds,
        trainControl.repeats = glb_rcv_n_repeats,
        trainControl.classProbs = glb_is_classification,
        trainControl.summaryFunction = glbMdlMetricSummaryFn,
        train.metric = glbMdlMetricSummary,
        train.maximize = glbMdlMetricMaximize,
        train.method = "glmnet"
    )),
    indep_vars = indep_vars,
    rsp_var = glb_rsp_var,
    fit_df = glbObsFit,
    OOB_df = glbObsOOB
)
## [1] "fitting model: Low.cor.X##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0934 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Low.cor.X", : model's bestTune found at an extreme of
## tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Low.cor.X", : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 100 -none- numeric
## beta 26100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 261 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.202862e+00
## NDSSName.my.fctr#Opinion#RoomForDebate
## -8.521086e-02
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 6.880206e-01
## NDSSName.my.fctr#U.S.#Education
## -5.784059e-02
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 6.701988e-01
## NDSSName.my.fctrBusiness#Technology#
## 2.290481e-02
## NDSSName.my.fctrForeign#World#AsiaPacific
## -3.274668e-02
## NDSSName.my.fctrOpEd#Opinion#
## 7.842207e-01
## NDSSName.my.fctrScience#Health#
## 8.362226e-01
## NDSSName.my.fctrStyles##Fashion
## -3.079107e-02
## NDSSName.my.fctrStyles#U.S.#
## 6.586231e-01
## NDSSName.my.fctrTStyle##
## -9.956225e-02
## PubDate.day.minutes.poly.1
## 6.452334e+00
## PubDate.day.minutes.poly.2
## 3.185151e+00
## PubDate.day.minutes.poly.4
## 8.173930e-01
## PubDate.last4.log1p
## 4.098268e-03
## PubDate.wkday.fctr5
## -2.275236e-05
## PubDate.wkend
## 1.052728e-01
## WordCount.root2
## 2.981460e-02
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg
## 1.144109e+00
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg
## -4.058241e+00
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg
## 1.435473e+00
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 4.534495e+00
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg
## 4.741264e-01
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg
## 1.162763e-01
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg
## 1.806309e+00
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last16.log1p.ctg
## -4.705401e-03
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last16.log1p.ctg
## 4.304598e-02
## NDSSName.my.fctr#U.S.#Education:PubDate.last16.log1p.ctg
## -3.317348e-03
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last16.log1p.ctg
## 4.698895e-02
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last16.log1p.ctg
## -2.205370e-03
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last16.log1p.ctg
## 4.593643e-02
## NDSSName.my.fctrScience#Health#:PubDate.last16.log1p.ctg
## 3.155326e-02
## NDSSName.my.fctrStyles##Fashion:PubDate.last16.log1p.ctg
## -4.428299e-04
## NDSSName.my.fctrStyles#U.S.#:PubDate.last16.log1p.ctg
## 2.398555e-02
## NDSSName.my.fctrTStyle##:PubDate.last16.log1p.ctg
## -6.710680e-03
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last2.log1p.ctg
## -7.172957e-03
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last2.log1p.ctg
## 3.276090e-02
## NDSSName.my.fctr#U.S.#Education:PubDate.last2.log1p.ctg
## -5.154022e-03
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last2.log1p.ctg
## 6.583764e-02
## NDSSName.my.fctrBusiness#Technology#:PubDate.last2.log1p.ctg
## 8.924455e-03
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last2.log1p.ctg
## -3.157626e-03
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last2.log1p.ctg
## 6.742134e-02
## NDSSName.my.fctrScience#Health#:PubDate.last2.log1p.ctg
## 5.375306e-02
## NDSSName.my.fctrStyles##Fashion:PubDate.last2.log1p.ctg
## -1.106633e-03
## NDSSName.my.fctrStyles#U.S.#:PubDate.last2.log1p.ctg
## 5.997036e-02
## NDSSName.my.fctrTStyle##:PubDate.last2.log1p.ctg
## -9.591957e-03
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last32.log1p.ctg
## -4.820085e-03
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last32.log1p.ctg
## 4.357947e-02
## NDSSName.my.fctr#U.S.#Education:PubDate.last32.log1p.ctg
## -2.077560e-03
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last32.log1p.ctg
## 4.049781e-02
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last32.log1p.ctg
## 3.962558e-02
## NDSSName.my.fctrScience#Health#:PubDate.last32.log1p.ctg
## 3.048215e-02
## NDSSName.my.fctrStyles#U.S.#:PubDate.last32.log1p.ctg
## 2.113852e-02
## NDSSName.my.fctrTStyle##:PubDate.last32.log1p.ctg
## -5.645530e-03
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last4.log1p.ctg
## -6.756004e-03
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last4.log1p.ctg
## 3.427222e-02
## NDSSName.my.fctr#U.S.#Education:PubDate.last4.log1p.ctg
## -4.871129e-03
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last4.log1p.ctg
## 6.144420e-02
## NDSSName.my.fctrBusiness#Technology#:PubDate.last4.log1p.ctg
## 5.221287e-03
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last4.log1p.ctg
## -3.128905e-03
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last4.log1p.ctg
## 5.939321e-02
## NDSSName.my.fctrScience#Health#:PubDate.last4.log1p.ctg
## 4.046602e-02
## NDSSName.my.fctrStyles##Fashion:PubDate.last4.log1p.ctg
## -9.613579e-04
## NDSSName.my.fctrStyles#U.S.#:PubDate.last4.log1p.ctg
## 5.354044e-02
## NDSSName.my.fctrTStyle##:PubDate.last4.log1p.ctg
## -8.318684e-03
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last8.log1p.ctg
## -6.377549e-03
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last8.log1p.ctg
## 4.012450e-02
## NDSSName.my.fctr#U.S.#Education:PubDate.last8.log1p.ctg
## -4.056166e-03
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last8.log1p.ctg
## 4.714690e-02
## NDSSName.my.fctrBusiness#Technology#:PubDate.last8.log1p.ctg
## 3.372173e-03
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last8.log1p.ctg
## -3.104981e-03
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last8.log1p.ctg
## 5.275033e-02
## NDSSName.my.fctrScience#Health#:PubDate.last8.log1p.ctg
## 3.856981e-02
## NDSSName.my.fctrStyles##Fashion:PubDate.last8.log1p.ctg
## -1.044166e-03
## NDSSName.my.fctrStyles#U.S.#:PubDate.last8.log1p.ctg
## 3.610233e-02
## NDSSName.my.fctrTStyle##:PubDate.last8.log1p.ctg
## -7.703998e-03
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.2917372206
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.1077991963
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 0.7145162057
## NDSSName.my.fctr#U.S.#Education
## -0.0675153576
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 0.6847086604
## NDSSName.my.fctrBusiness#Technology#
## 0.0338425465
## NDSSName.my.fctrForeign#World#
## -0.0056510270
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.0431336996
## NDSSName.my.fctrOpEd#Opinion#
## 0.8080156260
## NDSSName.my.fctrScience#Health#
## 0.8720969876
## NDSSName.my.fctrStyles##Fashion
## -0.0449467556
## NDSSName.my.fctrStyles#U.S.#
## 0.6848458465
## NDSSName.my.fctrTStyle##
## -0.1068542341
## PubDate.day.minutes.poly.1
## 6.9218320529
## PubDate.day.minutes.poly.2
## 3.7125649527
## PubDate.day.minutes.poly.4
## 1.0946423451
## PubDate.last4.log1p
## 0.0074154519
## PubDate.wkday.fctr5
## -0.0099640217
## PubDate.wkend
## 0.1191625256
## WordCount.root2
## 0.0317225870
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg
## 1.4076395302
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg
## -5.5235755281
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg
## 1.6550615516
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 5.3180890769
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg
## 0.7422606550
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg
## 0.4015516661
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.5.ctg
## 1.2519607267
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.5.ctg
## 0.0151914873
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg
## 2.7388433211
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last16.log1p.ctg
## -0.0063971273
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last16.log1p.ctg
## 0.0444497010
## NDSSName.my.fctr#U.S.#Education:PubDate.last16.log1p.ctg
## -0.0040421728
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last16.log1p.ctg
## 0.0476287025
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last16.log1p.ctg
## -0.0030456278
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last16.log1p.ctg
## 0.0463442551
## NDSSName.my.fctrScience#Health#:PubDate.last16.log1p.ctg
## 0.0314477085
## NDSSName.my.fctrStyles##Fashion:PubDate.last16.log1p.ctg
## -0.0014930601
## NDSSName.my.fctrStyles#U.S.#:PubDate.last16.log1p.ctg
## 0.0234751275
## NDSSName.my.fctrTStyle##:PubDate.last16.log1p.ctg
## -0.0072231460
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last2.log1p.ctg
## -0.0093789969
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last2.log1p.ctg
## 0.0336400023
## NDSSName.my.fctr#U.S.#Education:PubDate.last2.log1p.ctg
## -0.0061272849
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last2.log1p.ctg
## 0.0671838538
## NDSSName.my.fctrBusiness#Technology#:PubDate.last2.log1p.ctg
## 0.0105210601
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last2.log1p.ctg
## -0.0042188810
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last2.log1p.ctg
## 0.0687814614
## NDSSName.my.fctrScience#Health#:PubDate.last2.log1p.ctg
## 0.0550257721
## NDSSName.my.fctrStyles##Fashion:PubDate.last2.log1p.ctg
## -0.0024258118
## NDSSName.my.fctrStyles#U.S.#:PubDate.last2.log1p.ctg
## 0.0620565904
## NDSSName.my.fctrTStyle##:PubDate.last2.log1p.ctg
## -0.0103290150
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last32.log1p.ctg
## -0.0064579863
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last32.log1p.ctg
## 0.0450789075
## NDSSName.my.fctr#U.S.#Education:PubDate.last32.log1p.ctg
## -0.0027461150
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last32.log1p.ctg
## 0.0408247292
## NDSSName.my.fctrBusiness#Technology#:PubDate.last32.log1p.ctg
## 0.0003621433
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last32.log1p.ctg
## -0.0005633660
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last32.log1p.ctg
## 0.0397403693
## NDSSName.my.fctrScience#Health#:PubDate.last32.log1p.ctg
## 0.0304028028
## NDSSName.my.fctrStyles##Fashion:PubDate.last32.log1p.ctg
## -0.0001455953
## NDSSName.my.fctrStyles#U.S.#:PubDate.last32.log1p.ctg
## 0.0205734403
## NDSSName.my.fctrTStyle##:PubDate.last32.log1p.ctg
## -0.0060725179
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last4.log1p.ctg
## -0.0087853392
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last4.log1p.ctg
## 0.0351829015
## NDSSName.my.fctr#U.S.#Education:PubDate.last4.log1p.ctg
## -0.0057617515
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last4.log1p.ctg
## 0.0627276287
## NDSSName.my.fctrBusiness#Technology#:PubDate.last4.log1p.ctg
## 0.0064552363
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last4.log1p.ctg
## -0.0041041644
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last4.log1p.ctg
## 0.0603096891
## NDSSName.my.fctrScience#Health#:PubDate.last4.log1p.ctg
## 0.0407244990
## NDSSName.my.fctrStyles##Fashion:PubDate.last4.log1p.ctg
## -0.0021050459
## NDSSName.my.fctrStyles#U.S.#:PubDate.last4.log1p.ctg
## 0.0551911420
## NDSSName.my.fctrTStyle##:PubDate.last4.log1p.ctg
## -0.0089469436
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last8.log1p.ctg
## -0.0082606820
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last8.log1p.ctg
## 0.0412828578
## NDSSName.my.fctr#U.S.#Education:PubDate.last8.log1p.ctg
## -0.0048320742
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last8.log1p.ctg
## 0.0475545601
## NDSSName.my.fctrBusiness#Technology#:PubDate.last8.log1p.ctg
## 0.0044231483
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last8.log1p.ctg
## -0.0040431528
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last8.log1p.ctg
## 0.0534726687
## NDSSName.my.fctrScience#Health#:PubDate.last8.log1p.ctg
## 0.0389034575
## NDSSName.my.fctrStyles##Fashion:PubDate.last8.log1p.ctg
## -0.0021193073
## NDSSName.my.fctrStyles#U.S.#:PubDate.last8.log1p.ctg
## 0.0364380868
## NDSSName.my.fctrTStyle##:PubDate.last8.log1p.ctg
## -0.0082891522
## Prediction
## Reference N Y
## N 3787 154
## Y 174 689
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.317236e-01 7.662359e-01 9.242213e-01 9.386958e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 7.764099e-112 2.941323e-01
## Prediction
## Reference N Y
## N 1209 289
## Y 94 136
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.783565e-01 2.931798e-01 7.580195e-01 7.977437e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 3.657340e-23
## id
## 1 Low.cor.X##rcv#glmnet
## feats
## 1 WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 5 76.732 4.95
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8624894 0.9659985 0.7589803 0.958864
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.8077374 0.9276303
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9242213 0.9386958 0.7453708
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5917252 0.9138852 0.2695652 0.8052766
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4152672 0.7783565
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7580195 0.7977437 0.2931798
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.004942454 0.01832711
# Close out the fit.models_0 stage: append its final "teardown" step to the
# stage's chunk table (myadd_chunk prints the updated rows).
fit.models_0_chunk_df <- myadd_chunk(
    fit.models_0_chunk_df, "fit.models_0_end",
    label.minor = "teardown", major.inc = FALSE
)
## label step_major step_minor label_minor bgn
## 9 fit.models_0_Low.cor.X 1 8 glmnet 243.470
## 10 fit.models_0_end 1 9 teardown 328.355
## end elapsed
## 9 328.355 84.885
## 10 NA NA
# The result of the last model fit is not needed beyond this point.
rm(list = "ret_lst")
# Register the next minor step of the "fit.models" chunk in the global table.
glb_chunks_df <-
    myadd_chunk(glb_chunks_df, "fit.models", major.inc = FALSE)
## label step_major step_minor label_minor bgn end elapsed
## 10 fit.models 6 0 0 63.608 328.369 264.762
## 11 fit.models 6 1 1 328.370 NA NA
# Start a fresh chunk table for the fit.models_1 stage.
fit.models_1_chunk_df <-
    myadd_chunk(NULL, "fit.models_1_bgn", label.minor = "setup")
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_1_bgn 1 0 setup 342.944 NA NA
#stop("here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)
# Fit one model for every (family, method) pair:
#   * families are the names of glb_mdl_family_lst; each entry lists the caret
#     methods to try (glbMdlMethods is used when the entry is NULL);
#   * features come from, in order of precedence: the ".Interact" logic below,
#     glb_mdl_feats_lst[[family]], the RFE predictors (for "RFE." families),
#     or every non-nzv, non-excluded feature in glb_feats_df;
#   * observations listed in glbObsFitOutliers[[family]] are dropped from the
#     fit data.
# topindep_var / interact_vars are computed once (first ".Interact" family)
# and kept across loop iterations.
topindep_var <- NULL; interact_vars <- NULL;
for (mdl_id_pfx in names(glb_mdl_family_lst)) {
    fit.models_1_chunk_df <-
        myadd_chunk(fit.models_1_chunk_df, paste0("fit.models_1_", mdl_id_pfx),
                    major.inc = FALSE, label.minor = "setup")

    indep_vars <- NULL;

    if (grepl("\\.Interact", mdl_id_pfx)) {
        if (is.null(topindep_var) && is.null(interact_vars)) {
            # select best glmnet model upto now
            dsp_models_df <- orderBy(model_sel_frmla <- get_model_sel_frmla(),
                                     glb_models_df)
            # Model ids are "#"-delimited with the method last
            # (e.g. "All.X##rcv#glmnet"); the legacy "<prefix>.glmnet" form is
            # still accepted.
            dsp_models_df <- subset(dsp_models_df,
                                    grepl(".glmnet", id, fixed = TRUE) |
                                    grepl("#glmnet", id, fixed = TRUE))
            if (nrow(dsp_models_df) == 0) {
                warning("No glmnet model available as base for: ", mdl_id_pfx)
                next
            }
            bst_mdl_id <- dsp_models_df$id[1]

            # Family prefix of the base model: first "#" field for current ids,
            # everything before the last "." for legacy ids.
            if (grepl("#", bst_mdl_id, fixed = TRUE)) {
                bst_mdl_id_pfx <- unlist(strsplit(bst_mdl_id, "#", fixed = TRUE))[1]
            } else {
                bst_mdl_id_pfx <- head(unlist(strsplit(bst_mdl_id, "[.]")), -1)
            }
            mdl_id_pfx <- paste(c(bst_mdl_id_pfx, "Interact"), collapse = ".")

            # select important features
            if (is.null(bst_featsimp_df <-
                        myget_feats_importance(glb_models_lst[[bst_mdl_id]]))) {
                warning("Base model for RFE.Interact: ", bst_mdl_id,
                        " has no important features")
                next
            }

            # Top feature = first row of the importance table that is not
            # restricted to interactions only; factor dummies are mapped back
            # to their "<name>.fctr" feature.
            topindep_ix <- 1
            while (is.null(topindep_var) && (topindep_ix <= nrow(bst_featsimp_df))) {
                topindep_var <- row.names(bst_featsimp_df)[topindep_ix]
                if (grepl(".fctr", topindep_var, fixed = TRUE))
                    topindep_var <-
                        paste0(unlist(strsplit(topindep_var, ".fctr"))[1], ".fctr")
                if (topindep_var %in% names(glbFeatsInteractionOnly)) {
                    topindep_var <- NULL; topindep_ix <- topindep_ix + 1
                } else break
            }

            # select features with importance > max(10, importance of .rnorm) & is not highest
            # combine factor dummy features to just the factor feature
            if (length(pos_rnorm <-
                       grep(".rnorm", row.names(bst_featsimp_df), fixed = TRUE)) > 0)
                imp_rnorm <- bst_featsimp_df[pos_rnorm, 1] else
                imp_rnorm <- NA
            imp_cutoff <- max(10, imp_rnorm, na.rm = TRUE)
            interact_vars <-
                tail(row.names(subset(bst_featsimp_df,
                                      imp > imp_cutoff)), -1)
            if (length(interact_vars) > 0) {
                interact_vars <-
                    myadjust_interaction_feats(myextract_actual_feats(interact_vars))
                interact_vars <-
                    interact_vars[!grepl(topindep_var, interact_vars, fixed = TRUE)]
            }
            ### bid0_sp only
            #             interact_vars <- c(
            #     "biddable", "D.ratio.sum.TfIdf.wrds.n", "D.TfIdf.sum.stem.stop.Ratio", "D.sum.TfIdf",
            #     "D.TfIdf.sum.post.stop", "D.TfIdf.sum.post.stem", "D.ratio.wrds.stop.n.wrds.n", "D.chrs.uppr.n.log",
            #     "D.chrs.n.log", "color.fctr"
            #     # , "condition.fctr", "prdl.my.descr.fctr"
            #                                 )
            #            interact_vars <- setdiff(interact_vars, c("startprice.dgt2.is9", "color.fctr"))
            ###
            indep_vars <- myextract_actual_feats(row.names(bst_featsimp_df))
            indep_vars <- setdiff(indep_vars, topindep_var)
            if (length(interact_vars) > 0) {
                indep_vars <-
                    setdiff(indep_vars, myextract_actual_feats(interact_vars))
                indep_vars <- c(indep_vars,
                                paste(topindep_var, setdiff(interact_vars, topindep_var),
                                      sep = "*"))
            } else indep_vars <- union(indep_vars, topindep_var)
        }
    }

    # Fall back through the remaining feature sources.
    if (is.null(indep_vars))
        indep_vars <- glb_mdl_feats_lst[[mdl_id_pfx]]

    if (is.null(indep_vars) && grepl("RFE\\.", mdl_id_pfx))
        indep_vars <- myextract_actual_feats(predictors(rfe_fit_results))

    if (is.null(indep_vars))
        indep_vars <- subset(glb_feats_df, !nzv & (exclude.as.feat != 1))[, "id"]

    # A single entry starting with "%<d-%" holds an R expression that yields
    # the actual feature vector.
    if ((length(indep_vars) == 1) && (grepl("^%<d-%", indep_vars))) {
        indep_vars <-
            eval(parse(text = str_trim(unlist(strsplit(indep_vars, "%<d-%"))[2])))
    }

    indep_vars <- myadjust_interaction_feats(indep_vars)

    if (grepl("\\.Interact", mdl_id_pfx)) {
        # if (method != tail(unlist(strsplit(bst_mdl_id, "[.]")), 1)) next
        if (is.null(glb_mdl_family_lst[[mdl_id_pfx]])) {
            if (!is.null(glb_mdl_family_lst[["Best.Interact"]]))
                glb_mdl_family_lst[[mdl_id_pfx]] <-
                    glb_mdl_family_lst[["Best.Interact"]]
        }
    }

    if (!is.null(glbObsFitOutliers[[mdl_id_pfx]])) {
        fitobs_df <- glbObsFit[!(glbObsFit[, glb_id_var] %in%
                                     glbObsFitOutliers[[mdl_id_pfx]]), ]
    } else fitobs_df <- glbObsFit

    if (is.null(glb_mdl_family_lst[[mdl_id_pfx]]))
        mdl_methods <- glbMdlMethods else
        mdl_methods <- glb_mdl_family_lst[[mdl_id_pfx]]

    # These filters do not depend on the method, so apply them once per family.
    # The last([[:digit:]]+)(.*)\\.ctg feats are taking a long time for this experiment & not proving to be important
    indep_vars <- indep_vars[!grepl("\\.last([[:digit:]]+)(.*)\\.ctg", indep_vars)]
    # The poly\\.([[:digit:]]+)\\.ctg feats are taking a long time for this experiment & not proving to be important
    indep_vars <- indep_vars[!grepl("\\.poly\\.([[:digit:]]+)\\.ctg", indep_vars)]

    for (method in mdl_methods) {
        # Work on a per-method copy so that dropping .rnorm for rpart / rf does
        # not leak into the methods fitted after them for the same family.
        mdl_indep_vars <- indep_vars
        if (method %in% c("rpart", "rf")) {
            # rpart:  fubar's the tree
            # rf:     skip the scenario w/ .rnorm for speed
            mdl_indep_vars <- setdiff(mdl_indep_vars, c(".rnorm"))
            #mdl_id <- paste0(mdl_id_pfx, ".no.rnorm")
        }

        fit.models_1_chunk_df <- myadd_chunk(fit.models_1_chunk_df,
                                paste0("fit.models_1_", mdl_id_pfx), major.inc = FALSE,
                                label.minor = method)

        ret_lst <-
            myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
                id.prefix = mdl_id_pfx,
                type = glb_model_type,
                # All.X + glmnet gets its own tuning grid
                tune.df =
                    if ((mdl_id_pfx %in% "All.X") && (method %in% "glmnet")) glmnet_tune_models_df else
                        glb_tune_models_df,
                trainControl.method = "repeatedcv",
                trainControl.number = glb_rcv_n_folds,
                trainControl.repeats = glb_rcv_n_repeats,
                trainControl.classProbs = glb_is_classification,
                trainControl.summaryFunction = glbMdlMetricSummaryFn,
                #trainControl.allowParallel = FALSE,
                train.metric = glbMdlMetricSummary,
                train.maximize = glbMdlMetricMaximize,
                train.method = method)),
                indep_vars = mdl_indep_vars, rsp_var = glb_rsp_var,
                fit_df = fitobs_df, OOB_df = glbObsOOB)
    }
}
## label step_major step_minor label_minor bgn end
## 1 fit.models_1_bgn 1 0 setup 342.944 342.955
## 2 fit.models_1_All.X 1 1 setup 342.956 NA
## elapsed
## 1 0.011
## 2 NA
## label step_major step_minor label_minor bgn end
## 2 fit.models_1_All.X 1 1 setup 342.956 342.965
## 3 fit.models_1_All.X 1 2 glmnet 342.966 NA
## elapsed
## 2 0.01
## 3 NA
## [1] "fitting model: All.X##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,WordCount.log1p,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.hour.fctr,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last2.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,PubDate.juliandate,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0934 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 100 -none- numeric
## beta 5700 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 57 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.70850276
## NDSSName.my.fctr#Multimedia#
## -0.03054293
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.60964877
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 1.94929790
## NDSSName.my.fctr#U.S.#Education
## -0.26727710
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.16615102
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.12950830
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 2.23033621
## NDSSName.my.fctrCulture#Arts#
## -0.17355619
## NDSSName.my.fctrForeign#World#
## -0.14144794
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.30359599
## NDSSName.my.fctrOpEd#Opinion#
## 2.48784290
## NDSSName.my.fctrScience#Health#
## 1.99084311
## NDSSName.my.fctrStyles##Fashion
## -0.25265329
## NDSSName.my.fctrStyles#U.S.#
## 1.82382955
## NDSSName.my.fctrTStyle##
## -0.42107301
## NDSSName.my.fctrTravel#Travel#
## -0.11335741
## PubDate.day.minutes.poly.1
## 9.80483966
## PubDate.day.minutes.poly.2
## 1.82278821
## PubDate.day.minutes.poly.4
## 4.04394519
## PubDate.hour.fctr(15.3,23]
## 0.04018079
## PubDate.last2.log1p
## 0.01063398
## PubDate.last4.log1p
## 0.01736546
## PubDate.wkend
## 0.15331409
## WordCount.log1p
## 0.14957685
## WordCount.root2
## 0.02369866
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.836566257
## NDSSName.my.fctr#Multimedia#
## -0.057640450
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.690659501
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 2.057495643
## NDSSName.my.fctr#U.S.#Education
## -0.297208511
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.177778863
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.158140199
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 2.307234767
## NDSSName.my.fctrCulture#Arts#
## -0.187805623
## NDSSName.my.fctrForeign#World#
## -0.169688427
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.341074056
## NDSSName.my.fctrOpEd#Opinion#
## 2.575223003
## NDSSName.my.fctrScience#Health#
## 2.067085985
## NDSSName.my.fctrStyles##Fashion
## -0.291552509
## NDSSName.my.fctrStyles#U.S.#
## 1.899896438
## NDSSName.my.fctrTStyle##
## -0.447864596
## NDSSName.my.fctrTravel#Travel#
## -0.145852260
## NDSSName.my.fctrmyOther
## -0.020871349
## PubDate.day.minutes.poly.1
## 10.208863233
## PubDate.day.minutes.poly.2
## 2.133565453
## PubDate.day.minutes.poly.4
## 4.405958526
## PubDate.hour.fctr(15.3,23]
## 0.042371837
## PubDate.last2.log1p
## 0.012177521
## PubDate.last4.log1p
## 0.019001562
## PubDate.last8.log1p
## 0.001906538
## PubDate.wkend
## 0.162732276
## WordCount.log1p
## 0.156162477
## WordCount.root2
## 0.024707718
## Prediction
## Reference N Y
## N 3790 151
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.317236e-01 7.655936e-01 9.242213e-01 9.386958e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 7.764099e-112 1.674653e-01
## Prediction
## Reference N Y
## N 874 624
## Y 24 206
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.250000e-01 2.276988e-01 6.016877e-01 6.478869e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.967674e-122
## id
## 1 All.X##rcv#glmnet
## feats
## 1 WordCount.root2,WordCount.log1p,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.hour.fctr,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last2.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,PubDate.juliandate,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 5 8.373 0.647
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8398093 0.9774169 0.7022016 0.9585097
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.8070588 0.9261737
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9242213 0.9386958 0.7273256
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5873513 0.9399199 0.2347826 0.8158791
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.3886792 0.625
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.6016877 0.6478869 0.2276988
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.004371691 0.01854375
# Check if other preProcess methods improve model performance:
# open the "preProc" step in the fit.models_1 chunk table.
fit.models_1_chunk_df <- myadd_chunk(
    fit.models_1_chunk_df, "fit.models_1_preProc",
    label.minor = "preProc", major.inc = FALSE
)
## label step_major step_minor label_minor bgn end
## 3 fit.models_1_All.X 1 2 glmnet 342.966 359.438
## 4 fit.models_1_preProc 1 3 preProc 359.439 NA
## elapsed
## 3 16.472
## 4 NA
# Re-fit the top-ranked model (first row of glb_models_df ordered by the model
# selection formula) once per entry of glb_preproc_methods, using the same
# features, to see whether a preProcess option improves it.
mdl_id <- orderBy(get_model_sel_frmla(), glb_models_df)[1, "id"]
# The "feats" column stores the feature names as one comma-separated string.
indep_vars_vctr <- trim(unlist(strsplit(glb_models_df[glb_models_df$id == mdl_id,
                                                      "feats"], "[,]")))
# Model ids are "#"-delimited with the family prefix first and the caret method
# last (e.g. "Max.cor.Y.rcv.1X1.cp.0###rpart"). Splitting such an id on "."
# would yield method "0###rpart", so split on "#" and only fall back to the
# legacy "<prefix>.<method>" form when the id contains no "#".
mdl_id_parts <- unlist(strsplit(mdl_id, "#", fixed = TRUE))
if (length(mdl_id_parts) > 1) {
    method <- tail(mdl_id_parts, 1)
    mdl_id_pfx <- mdl_id_parts[1]
} else {
    mdl_id_parts <- unlist(strsplit(mdl_id, "[.]"))
    method <- tail(mdl_id_parts, 1)
    mdl_id_pfx <- paste0(head(mdl_id_parts, -1), collapse = ".")
}
# Drop the observations flagged as outliers for this model family, if any.
if (!is.null(glbObsFitOutliers[[mdl_id_pfx]])) {
    fitobs_df <- glbObsFit[!(glbObsFit[, glb_id_var] %in%
                                 glbObsFitOutliers[[mdl_id_pfx]]), ]
} else fitobs_df <- glbObsFit

for (prePr in glb_preproc_methods) {
    # The operations are applied in this order:
    #   Box-Cox/Yeo-Johnson transformation, centering, scaling, range, imputation, PCA, ICA then spatial sign.

    ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = mdl_id_pfx,
            type = glb_model_type, tune.df = glb_tune_models_df,
            trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds,
            trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = method, train.preProcess = prePr)),
            indep_vars = indep_vars_vctr, rsp_var = glb_rsp_var,
            fit_df = fitobs_df, OOB_df = glbObsOOB)
}
# If (All|RFE).X.glm is less accurate than Low.Cor.X.glm
# check NA coefficients & filter appropriate terms in indep_vars_vctr
# if (method == "glm") {
# orig_glm <- glb_models_lst[[paste0(mdl_id, ".", model_method)]]$finalModel
# orig_glm <- glb_models_lst[["All.X.glm"]]$finalModel; print(summary(orig_glm))
# orig_glm <- glb_models_lst[["RFE.X.glm"]]$finalModel; print(summary(orig_glm))
# require(car)
# vif_orig_glm <- vif(orig_glm); print(vif_orig_glm)
# # if vif errors out with "there are aliased coefficients in the model"
# alias_orig_glm <- alias(orig_glm); alias_complete_orig_glm <- (alias_orig_glm$Complete > 0); alias_complete_orig_glm <- alias_complete_orig_glm[rowSums(alias_complete_orig_glm) > 0, colSums(alias_complete_orig_glm) > 0]; print(alias_complete_orig_glm)
# print(vif_orig_glm[!is.na(vif_orig_glm) & (vif_orig_glm == Inf)])
# print(which.max(vif_orig_glm))
# print(sort(vif_orig_glm[vif_orig_glm >= 1.0e+03], decreasing=TRUE))
# glbObsFit[c(1143, 3637, 3953, 4105), c("UniqueID", "Popular", "H.P.quandary", "Headline")]
# glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.chrs.n.log", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in% grep("[HSA]\\.chrs.n.log", glb_feats_df$id, value=TRUE), ]
# all.equal(glbObsAll$S.chrs.uppr.n.log, glbObsAll$A.chrs.uppr.n.log)
# cor(glbObsAll$S.T.herald, glbObsAll$S.T.tribun)
# mydspObs(Abstract.contains="[Dd]iar", cols=("Abstract"), all=TRUE)
# subset(glb_feats_df, cor.y.abs <= glb_feats_df[glb_feats_df$id == ".rnorm", "cor.y.abs"])
# corxx_mtrx <- cor(data.matrix(glbObsAll[, setdiff(names(glbObsAll), myfind_chr_cols_df(glbObsAll))]), use="pairwise.complete.obs"); abs_corxx_mtrx <- abs(corxx_mtrx); diag(abs_corxx_mtrx) <- 0
# which.max(abs_corxx_mtrx["S.T.tribun", ])
# abs_corxx_mtrx["A.npnct08.log", "S.npnct08.log"]
# step_glm <- step(orig_glm)
# }
# Since caret does not optimize rpart well
# if (method == "rpart")
# ret_lst <- myfit_mdl(mdl_id=paste0(mdl_id_pfx, ".cp.0"), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB,
# n_cv_folds=0, tune_models_df=data.frame(parameter="cp", min=0.0, max=0.0, by=0.1))
# User specified
# Ensure at least 2 vars in each regression; else varImp crashes
# sav_models_lst <- glb_models_lst; sav_models_df <- glb_models_df; sav_featsimp_df <- glb_featsimp_df; all.equal(sav_featsimp_df, glb_featsimp_df)
# glb_models_lst <- sav_models_lst; glb_models_df <- sav_models_df; glm_featsimp_df <- sav_featsimp_df
# easier to exclude features
# require(gdata) # needed for trim
# mdl_id <- "";
# indep_vars_vctr <- head(subset(glb_models_df, grepl("All\\.X\\.", mdl_id), select=feats)
# , 1)[, "feats"]
# indep_vars_vctr <- trim(unlist(strsplit(indep_vars_vctr, "[,]")))
# indep_vars_vctr <- setdiff(indep_vars_vctr, ".rnorm")
# easier to include features
#stop("here"); sav_models_df <- glb_models_df; glb_models_df <- sav_models_df
# !_sp
# mdl_id <- "csm"; indep_vars_vctr <- c(NULL
# ,"prdline.my.fctr", "prdline.my.fctr:.clusterid.fctr"
# ,"prdline.my.fctr*biddable"
# #,"prdline.my.fctr*startprice.log"
# #,"prdline.my.fctr*startprice.diff"
# ,"prdline.my.fctr*condition.fctr"
# ,"prdline.my.fctr*D.terms.post.stop.n"
# #,"prdline.my.fctr*D.terms.post.stem.n"
# ,"prdline.my.fctr*cellular.fctr"
# # ,"<feat1>:<feat2>"
# )
# for (method in glbMdlMethods) {
# ret_lst <- myfit_mdl(mdl_id=mdl_id, model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB,
# n_cv_folds=glb_rcv_n_folds, tune_models_df=glb_tune_models_df)
# csm_mdl_id <- paste0(mdl_id, ".", method)
# csm_featsimp_df <- myget_feats_importance(glb_models_lst[[paste0(mdl_id, ".",
# method)]]); print(head(csm_featsimp_df))
# }
###
# Ntv.1.lm <- lm(reformulate(indep_vars_vctr, glb_rsp_var), glbObsTrn); print(summary(Ntv.1.lm))
#glb_models_df[, "max.Accuracy.OOB", FALSE]
#varImp(glb_models_lst[["Low.cor.X.glm"]])
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.2.glm"]])$imp)
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.3.glm"]])$imp)
#glb_feats_df[grepl("npnct28", glb_feats_df$id), ]
# User specified bivariate models
# indep_vars_vctr_lst <- list()
# for (feat in setdiff(names(glbObsFit),
# union(glb_rsp_var, glbFeatsExclude)))
# indep_vars_vctr_lst[["feat"]] <- feat
# User specified combinatorial models
# indep_vars_vctr_lst <- list()
# combn_mtrx <- combn(c("<feat1_name>", "<feat2_name>", "<featn_name>"),
# <num_feats_to_choose>)
# for (combn_ix in 1:ncol(combn_mtrx))
# #print(combn_mtrx[, combn_ix])
# indep_vars_vctr_lst[[combn_ix]] <- combn_mtrx[, combn_ix]
# template for myfit_mdl
# rf is hard-coded in caret to recognize only Accuracy / Kappa evaluation metrics
# only for OOB in trainControl ?
# ret_lst <- myfit_mdl_fn(mdl_id=paste0(mdl_id_pfx, ""), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB,
# n_cv_folds=glb_rcv_n_folds, tune_models_df=glb_tune_models_df,
# model_loss_mtrx=glbMdlMetric_terms,
# model_summaryFunction=glbMdlMetricSummaryFn,
# model_metric=glbMdlMetricSummary,
# model_metric_maximize=glbMdlMetricMaximize)
# Simplify a model
# fit_df <- glbObsFit; glb_mdl <- step(<complex>_mdl)
# Non-caret models
# rpart_area_mdl <- rpart(reformulate("Area", response=glb_rsp_var),
# data=glbObsFit, #method="class",
# control=rpart.control(cp=0.12),
# parms=list(loss=glbMdlMetric_terms))
# print("rpart_sel_wlm_mdl"); prp(rpart_sel_wlm_mdl)
#
# Show the summary table of every model fitted so far (one row per model id,
# with its features, timings and fit / OOB metrics).
print(glb_models_df)
## id
## MFO###myMFO_classfr MFO###myMFO_classfr
## Random###myrandom_classfr Random###myrandom_classfr
## Max.cor.Y.rcv.1X1###glmnet Max.cor.Y.rcv.1X1###glmnet
## Max.cor.Y.rcv.3X1##rcv#glmnet Max.cor.Y.rcv.3X1##rcv#glmnet
## Max.cor.Y.rcv.3X3##rcv#glmnet Max.cor.Y.rcv.3X3##rcv#glmnet
## Max.cor.Y.rcv.3X5##rcv#glmnet Max.cor.Y.rcv.3X5##rcv#glmnet
## Max.cor.Y.rcv.5X1##rcv#glmnet Max.cor.Y.rcv.5X1##rcv#glmnet
## Max.cor.Y.rcv.5X3##rcv#glmnet Max.cor.Y.rcv.5X3##rcv#glmnet
## Max.cor.Y.rcv.5X5##rcv#glmnet Max.cor.Y.rcv.5X5##rcv#glmnet
## Max.cor.Y.rcv.1X1.cp.0###rpart Max.cor.Y.rcv.1X1.cp.0###rpart
## Max.cor.Y##rcv#rpart Max.cor.Y##rcv#rpart
## Max.cor.Y.Time.Poly##rcv#glmnet Max.cor.Y.Time.Poly##rcv#glmnet
## Max.cor.Y.Time.Lag##rcv#glmnet Max.cor.Y.Time.Lag##rcv#glmnet
## Interact.High.cor.Y##rcv#glmnet Interact.High.cor.Y##rcv#glmnet
## Low.cor.X##rcv#glmnet Low.cor.X##rcv#glmnet
## All.X##rcv#glmnet All.X##rcv#glmnet
## feats
## MFO###myMFO_classfr .rnorm
## Random###myrandom_classfr .rnorm
## Max.cor.Y.rcv.1X1###glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.3X1##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.3X3##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.3X5##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.5X1##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.5X3##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.5X5##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.1X1.cp.0###rpart WordCount.root2,NDSSName.my.fctr
## Max.cor.Y##rcv#rpart WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.Time.Poly##rcv#glmnet WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.day.minutes.poly.2,PubDate.day.minutes.poly.3,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## Max.cor.Y.Time.Lag##rcv#glmnet WordCount.root2,NDSSName.my.fctr,PubDate.last2.log1p,PubDate.last4.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.last32.log1p,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg
## Interact.High.cor.Y##rcv#glmnet WordCount.root2,NDSSName.my.fctr,WordCount.root2:WordCount.root2,WordCount.root2:PubDate.day.minutes.poly.1,WordCount.root2:PubDate.last4.log1p,WordCount.root2:PubDate.month.fctr
## Low.cor.X##rcv#glmnet WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## All.X##rcv#glmnet WordCount.root2,WordCount.log1p,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.hour.fctr,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last2.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,PubDate.juliandate,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5
## max.nTuningRuns min.elapsedtime.everything
## MFO###myMFO_classfr 0 0.301
## Random###myrandom_classfr 0 0.305
## Max.cor.Y.rcv.1X1###glmnet 0 1.067
## Max.cor.Y.rcv.3X1##rcv#glmnet 25 2.563
## Max.cor.Y.rcv.3X3##rcv#glmnet 25 4.736
## Max.cor.Y.rcv.3X5##rcv#glmnet 25 7.231
## Max.cor.Y.rcv.5X1##rcv#glmnet 25 3.463
## Max.cor.Y.rcv.5X3##rcv#glmnet 25 6.779
## Max.cor.Y.rcv.5X5##rcv#glmnet 25 9.270
## Max.cor.Y.rcv.1X1.cp.0###rpart 0 0.969
## Max.cor.Y##rcv#rpart 5 3.421
## Max.cor.Y.Time.Poly##rcv#glmnet 25 12.606
## Max.cor.Y.Time.Lag##rcv#glmnet 5 39.600
## Interact.High.cor.Y##rcv#glmnet 25 5.072
## Low.cor.X##rcv#glmnet 5 76.732
## All.X##rcv#glmnet 5 8.373
## min.elapsedtime.final max.AUCpROC.fit
## MFO###myMFO_classfr 0.003 0.5000000
## Random###myrandom_classfr 0.001 0.4990604
## Max.cor.Y.rcv.1X1###glmnet 0.278 0.8790544
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.277 0.8767919
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.278 0.8767919
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.278 0.8767919
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.280 0.8784031
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.278 0.8784031
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.280 0.8784031
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.076 0.8821543
## Max.cor.Y##rcv#rpart 0.074 0.8709432
## Max.cor.Y.Time.Poly##rcv#glmnet 1.702 0.8734975
## Max.cor.Y.Time.Lag##rcv#glmnet 2.782 0.8621088
## Interact.High.cor.Y##rcv#glmnet 0.322 0.8776419
## Low.cor.X##rcv#glmnet 4.950 0.8624894
## All.X##rcv#glmnet 0.647 0.8398093
## max.Sens.fit max.Spec.fit max.AUCROCR.fit
## MFO###myMFO_classfr 1.0000000 0.0000000 0.5000000
## Random###myrandom_classfr 0.8312611 0.1668598 0.4972757
## Max.cor.Y.rcv.1X1###glmnet 0.9632073 0.7949015 0.9608594
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.9644760 0.7891078 0.9582555
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.9644760 0.7891078 0.9582555
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.9644760 0.7891078 0.9582555
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.9642223 0.7925840 0.9607052
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.9642223 0.7925840 0.9607052
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.9642223 0.7925840 0.9607052
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.9705658 0.7937428 0.9504198
## Max.cor.Y##rcv#rpart 0.9632073 0.7786790 0.8746354
## Max.cor.Y.Time.Poly##rcv#glmnet 0.9659985 0.7809965 0.9534659
## Max.cor.Y.Time.Lag##rcv#glmnet 0.9652372 0.7589803 0.9558908
## Interact.High.cor.Y##rcv#glmnet 0.9626998 0.7925840 0.9625372
## Low.cor.X##rcv#glmnet 0.9659985 0.7589803 0.9588640
## All.X##rcv#glmnet 0.9774169 0.7022016 0.9585097
## opt.prob.threshold.fit max.f.score.fit
## MFO###myMFO_classfr 0.1 0.3045703
## Random###myrandom_classfr 0.1 0.3045703
## Max.cor.Y.rcv.1X1###glmnet 0.5 0.8099174
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.4 0.8099174
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.4 0.8099174
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.4 0.8099174
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5 0.8104265
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5 0.8104265
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5 0.8104265
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.4 0.8235294
## Max.cor.Y##rcv#rpart 0.6 0.8000000
## Max.cor.Y.Time.Poly##rcv#glmnet 0.4 0.8103957
## Max.cor.Y.Time.Lag##rcv#glmnet 0.2 0.8075338
## Interact.High.cor.Y##rcv#glmnet 0.4 0.8084359
## Low.cor.X##rcv#glmnet 0.2 0.8077374
## All.X##rcv#glmnet 0.3 0.8070588
## max.Accuracy.fit max.AccuracyLower.fit
## MFO###myMFO_classfr 0.1796420 0.1688795
## Random###myrandom_classfr 0.1796420 0.1688795
## Max.cor.Y.rcv.1X1###glmnet 0.9329725 0.9255302
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.9335973 0.9255302
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.9333193 0.9255302
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.9332218 0.9255302
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.9331818 0.9259666
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.9333905 0.9259666
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.9331816 0.9259666
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.9381765 0.9309917
## Max.cor.Y##rcv#rpart 0.9296422 0.9224771
## Max.cor.Y.Time.Poly##rcv#glmnet 0.9319320 0.9257484
## Max.cor.Y.Time.Lag##rcv#glmnet 0.9279769 0.9244394
## Interact.High.cor.Y##rcv#glmnet 0.9315850 0.9244394
## Low.cor.X##rcv#glmnet 0.9276303 0.9242213
## All.X##rcv#glmnet 0.9261737 0.9242213
## max.AccuracyUpper.fit max.Kappa.fit
## MFO###myMFO_classfr 0.1907952 0.0000000
## Random###myrandom_classfr 0.1907952 0.0000000
## Max.cor.Y.rcv.1X1###glmnet 0.9398832 0.7692476
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.9398832 0.7691678
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.9398832 0.7690803
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.9398832 0.7686375
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.9402789 0.7689055
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.9402789 0.7698577
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.9402789 0.7691429
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.9448229 0.7860827
## Max.cor.Y##rcv#rpart 0.9371115 0.7515134
## Max.cor.Y.Time.Poly##rcv#glmnet 0.9400811 0.7628290
## Max.cor.Y.Time.Lag##rcv#glmnet 0.9388938 0.7473218
## Interact.High.cor.Y##rcv#glmnet 0.9388938 0.7641040
## Low.cor.X##rcv#glmnet 0.9386958 0.7453708
## All.X##rcv#glmnet 0.9386958 0.7273256
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB
## MFO###myMFO_classfr 0.5000000 1.0000000 0.0000000
## Random###myrandom_classfr 0.5125675 0.8077437 0.2173913
## Max.cor.Y.rcv.1X1###glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.6174697 0.9218959 0.3130435
## Max.cor.Y##rcv#rpart 0.5870523 0.9045394 0.2695652
## Max.cor.Y.Time.Poly##rcv#glmnet 0.5950717 0.9118825 0.2782609
## Max.cor.Y.Time.Lag##rcv#glmnet 0.5927265 0.9158879 0.2695652
## Interact.High.cor.Y##rcv#glmnet 0.6009259 0.9105474 0.2913043
## Low.cor.X##rcv#glmnet 0.5917252 0.9138852 0.2695652
## All.X##rcv#glmnet 0.5873513 0.9399199 0.2347826
## max.AUCROCR.OOB opt.prob.threshold.OOB
## MFO###myMFO_classfr 0.5000000 0.1
## Random###myrandom_classfr 0.4857956 0.1
## Max.cor.Y.rcv.1X1###glmnet 0.8116126 0.1
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.8067975 0.1
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.8067975 0.1
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.8067975 0.1
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.8114863 0.1
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.8114863 0.1
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.8114863 0.1
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.7773858 0.1
## Max.cor.Y##rcv#rpart 0.5892132 0.6
## Max.cor.Y.Time.Poly##rcv#glmnet 0.7997373 0.1
## Max.cor.Y.Time.Lag##rcv#glmnet 0.8024758 0.1
## Interact.High.cor.Y##rcv#glmnet 0.8140971 0.1
## Low.cor.X##rcv#glmnet 0.8052766 0.1
## All.X##rcv#glmnet 0.8158791 0.1
## max.f.score.OOB max.Accuracy.OOB
## MFO###myMFO_classfr 0.2349336 0.1331019
## Random###myrandom_classfr 0.2349336 0.1331019
## Max.cor.Y.rcv.1X1###glmnet 0.4405405 0.7604167
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.4375839 0.7575231
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.4375839 0.7575231
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.4375839 0.7575231
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.4609375 0.7604167
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.4609375 0.7604167
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.4609375 0.7604167
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.4207493 0.7673611
## Max.cor.Y##rcv#rpart 0.2850575 0.8200231
## Max.cor.Y.Time.Poly##rcv#glmnet 0.4191617 0.7754630
## Max.cor.Y.Time.Lag##rcv#glmnet 0.4118565 0.7818287
## Interact.High.cor.Y##rcv#glmnet 0.4398340 0.7656250
## Low.cor.X##rcv#glmnet 0.4152672 0.7783565
## All.X##rcv#glmnet 0.3886792 0.6250000
## max.AccuracyLower.OOB
## MFO###myMFO_classfr 0.1174298
## Random###myrandom_classfr 0.1174298
## Max.cor.Y.rcv.1X1###glmnet 0.7395703
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.7365992
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.7365992
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.7365992
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.7395703
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.7395703
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.7395703
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.7467059
## Max.cor.Y##rcv#rpart 0.8010821
## Max.cor.Y.Time.Poly##rcv#glmnet 0.7550404
## Max.cor.Y.Time.Lag##rcv#glmnet 0.7615963
## Interact.High.cor.Y##rcv#glmnet 0.7449213
## Low.cor.X##rcv#glmnet 0.7580195
## All.X##rcv#glmnet 0.6016877
## max.AccuracyUpper.OOB max.Kappa.OOB
## MFO###myMFO_classfr 0.1500310 0.0000000
## Random###myrandom_classfr 0.1500310 0.0000000
## Max.cor.Y.rcv.1X1###glmnet 0.7803749 0.3148374
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.7775689 0.3107477
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.7775689 0.3107477
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.7775689 0.3107477
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.7803749 0.3373693
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.7803749 0.3373693
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.7803749 0.3373693
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.7871043 0.2953321
## Max.cor.Y##rcv#rpart 0.8378705 0.1825002
## Max.cor.Y.Time.Poly##rcv#glmnet 0.7949457 0.2963401
## Max.cor.Y.Time.Lag##rcv#glmnet 0.8010994 0.2908072
## Interact.High.cor.Y##rcv#glmnet 0.7854227 0.3156027
## Low.cor.X##rcv#glmnet 0.7977437 0.2931798
## All.X##rcv#glmnet 0.6478869 0.2276988
## max.AccuracySD.fit max.KappaSD.fit
## MFO###myMFO_classfr NA NA
## Random###myrandom_classfr NA NA
## Max.cor.Y.rcv.1X1###glmnet NA NA
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.007015493 0.02403706
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.005178375 0.01754365
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.005396525 0.01835474
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.008837283 0.03133449
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.006138477 0.02161286
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.006213800 0.02210061
## Max.cor.Y.rcv.1X1.cp.0###rpart NA NA
## Max.cor.Y##rcv#rpart 0.005069520 0.01910910
## Max.cor.Y.Time.Poly##rcv#glmnet 0.005321823 0.01901796
## Max.cor.Y.Time.Lag##rcv#glmnet 0.004678817 0.01649089
## Interact.High.cor.Y##rcv#glmnet 0.005250654 0.01810996
## Low.cor.X##rcv#glmnet 0.004942454 0.01832711
## All.X##rcv#glmnet 0.004371691 0.01854375
# Remove ret_lst from the workspace.
rm(ret_lst)
# Append the "fit.models_1_end" / "teardown" minor step to this stage's chunk
# log; major.inc = FALSE keeps the major step number unchanged (see the printed
# table that follows).
fit.models_1_chunk_df <-
myadd_chunk(fit.models_1_chunk_df, "fit.models_1_end", major.inc = FALSE,
label.minor = "teardown")
## label step_major step_minor label_minor bgn end
## 4 fit.models_1_preProc 1 3 preProc 359.439 359.515
## 5 fit.models_1_end 1 4 teardown 359.515 NA
## elapsed
## 4 0.076
## 5 NA
# Add another "fit.models" entry to the global chunk log without bumping the
# major step (the printed table shows the new row as step_minor 2).
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc = FALSE)
## label step_major step_minor label_minor bgn end elapsed
## 11 fit.models 6 1 1 328.370 359.524 31.155
## 12 fit.models 6 2 2 359.525 NA NA
# Start a new chunk log for the second part of fit.models; NULL is passed in
# place of an existing log, and the first row is labelled "setup".
fit.models_2_chunk_df <-
myadd_chunk(NULL, "fit.models_2_bgn", label.minor = "setup")
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_2_bgn 1 0 setup 361.256 NA NA
# Build the radar-plot input from glb_models_df:
#   * drop the dispersion / confidence-bound columns (names containing "SD",
#     "Upper" or "Lower");
#   * replace every "min.<metric>" column by its reciprocal "inv.<metric>".
# Logical column masks are used instead of negative grep() indices: a negative
# index built from an empty grep() result would select zero columns.
# drop = FALSE keeps a data frame even when a single column remains.
plt_models_df <- glb_models_df[, !grepl("SD|Upper|Lower", names(glb_models_df)),
                               drop = FALSE]
for (var in grep("^min[.]", names(plt_models_df), value = TRUE)) {
  # Reciprocal of the "min." metric, stored under the matching "inv." name
  plt_models_df[, sub("^min[.]", "inv.", var)] <- 1.0 / plt_models_df[, var]
  # Remove exactly the source column (literal name match, not a regex match)
  plt_models_df <- plt_models_df[, names(plt_models_df) != var, drop = FALSE]
}
print(plt_models_df)
## id
## MFO###myMFO_classfr MFO###myMFO_classfr
## Random###myrandom_classfr Random###myrandom_classfr
## Max.cor.Y.rcv.1X1###glmnet Max.cor.Y.rcv.1X1###glmnet
## Max.cor.Y.rcv.3X1##rcv#glmnet Max.cor.Y.rcv.3X1##rcv#glmnet
## Max.cor.Y.rcv.3X3##rcv#glmnet Max.cor.Y.rcv.3X3##rcv#glmnet
## Max.cor.Y.rcv.3X5##rcv#glmnet Max.cor.Y.rcv.3X5##rcv#glmnet
## Max.cor.Y.rcv.5X1##rcv#glmnet Max.cor.Y.rcv.5X1##rcv#glmnet
## Max.cor.Y.rcv.5X3##rcv#glmnet Max.cor.Y.rcv.5X3##rcv#glmnet
## Max.cor.Y.rcv.5X5##rcv#glmnet Max.cor.Y.rcv.5X5##rcv#glmnet
## Max.cor.Y.rcv.1X1.cp.0###rpart Max.cor.Y.rcv.1X1.cp.0###rpart
## Max.cor.Y##rcv#rpart Max.cor.Y##rcv#rpart
## Max.cor.Y.Time.Poly##rcv#glmnet Max.cor.Y.Time.Poly##rcv#glmnet
## Max.cor.Y.Time.Lag##rcv#glmnet Max.cor.Y.Time.Lag##rcv#glmnet
## Interact.High.cor.Y##rcv#glmnet Interact.High.cor.Y##rcv#glmnet
## Low.cor.X##rcv#glmnet Low.cor.X##rcv#glmnet
## All.X##rcv#glmnet All.X##rcv#glmnet
## feats
## MFO###myMFO_classfr .rnorm
## Random###myrandom_classfr .rnorm
## Max.cor.Y.rcv.1X1###glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.3X1##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.3X3##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.3X5##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.5X1##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.5X3##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.5X5##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.1X1.cp.0###rpart WordCount.root2,NDSSName.my.fctr
## Max.cor.Y##rcv#rpart WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.Time.Poly##rcv#glmnet WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.day.minutes.poly.2,PubDate.day.minutes.poly.3,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## Max.cor.Y.Time.Lag##rcv#glmnet WordCount.root2,NDSSName.my.fctr,PubDate.last2.log1p,PubDate.last4.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.last32.log1p,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg
## Interact.High.cor.Y##rcv#glmnet WordCount.root2,NDSSName.my.fctr,WordCount.root2:WordCount.root2,WordCount.root2:PubDate.day.minutes.poly.1,WordCount.root2:PubDate.last4.log1p,WordCount.root2:PubDate.month.fctr
## Low.cor.X##rcv#glmnet WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## All.X##rcv#glmnet WordCount.root2,WordCount.log1p,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.hour.fctr,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last2.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,PubDate.juliandate,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5
## max.nTuningRuns max.AUCpROC.fit
## MFO###myMFO_classfr 0 0.5000000
## Random###myrandom_classfr 0 0.4990604
## Max.cor.Y.rcv.1X1###glmnet 0 0.8790544
## Max.cor.Y.rcv.3X1##rcv#glmnet 25 0.8767919
## Max.cor.Y.rcv.3X3##rcv#glmnet 25 0.8767919
## Max.cor.Y.rcv.3X5##rcv#glmnet 25 0.8767919
## Max.cor.Y.rcv.5X1##rcv#glmnet 25 0.8784031
## Max.cor.Y.rcv.5X3##rcv#glmnet 25 0.8784031
## Max.cor.Y.rcv.5X5##rcv#glmnet 25 0.8784031
## Max.cor.Y.rcv.1X1.cp.0###rpart 0 0.8821543
## Max.cor.Y##rcv#rpart 5 0.8709432
## Max.cor.Y.Time.Poly##rcv#glmnet 25 0.8734975
## Max.cor.Y.Time.Lag##rcv#glmnet 5 0.8621088
## Interact.High.cor.Y##rcv#glmnet 25 0.8776419
## Low.cor.X##rcv#glmnet 5 0.8624894
## All.X##rcv#glmnet 5 0.8398093
## max.Sens.fit max.Spec.fit max.AUCROCR.fit
## MFO###myMFO_classfr 1.0000000 0.0000000 0.5000000
## Random###myrandom_classfr 0.8312611 0.1668598 0.4972757
## Max.cor.Y.rcv.1X1###glmnet 0.9632073 0.7949015 0.9608594
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.9644760 0.7891078 0.9582555
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.9644760 0.7891078 0.9582555
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.9644760 0.7891078 0.9582555
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.9642223 0.7925840 0.9607052
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.9642223 0.7925840 0.9607052
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.9642223 0.7925840 0.9607052
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.9705658 0.7937428 0.9504198
## Max.cor.Y##rcv#rpart 0.9632073 0.7786790 0.8746354
## Max.cor.Y.Time.Poly##rcv#glmnet 0.9659985 0.7809965 0.9534659
## Max.cor.Y.Time.Lag##rcv#glmnet 0.9652372 0.7589803 0.9558908
## Interact.High.cor.Y##rcv#glmnet 0.9626998 0.7925840 0.9625372
## Low.cor.X##rcv#glmnet 0.9659985 0.7589803 0.9588640
## All.X##rcv#glmnet 0.9774169 0.7022016 0.9585097
## opt.prob.threshold.fit max.f.score.fit
## MFO###myMFO_classfr 0.1 0.3045703
## Random###myrandom_classfr 0.1 0.3045703
## Max.cor.Y.rcv.1X1###glmnet 0.5 0.8099174
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.4 0.8099174
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.4 0.8099174
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.4 0.8099174
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5 0.8104265
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5 0.8104265
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5 0.8104265
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.4 0.8235294
## Max.cor.Y##rcv#rpart 0.6 0.8000000
## Max.cor.Y.Time.Poly##rcv#glmnet 0.4 0.8103957
## Max.cor.Y.Time.Lag##rcv#glmnet 0.2 0.8075338
## Interact.High.cor.Y##rcv#glmnet 0.4 0.8084359
## Low.cor.X##rcv#glmnet 0.2 0.8077374
## All.X##rcv#glmnet 0.3 0.8070588
## max.Accuracy.fit max.Kappa.fit
## MFO###myMFO_classfr 0.1796420 0.0000000
## Random###myrandom_classfr 0.1796420 0.0000000
## Max.cor.Y.rcv.1X1###glmnet 0.9329725 0.7692476
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.9335973 0.7691678
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.9333193 0.7690803
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.9332218 0.7686375
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.9331818 0.7689055
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.9333905 0.7698577
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.9331816 0.7691429
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.9381765 0.7860827
## Max.cor.Y##rcv#rpart 0.9296422 0.7515134
## Max.cor.Y.Time.Poly##rcv#glmnet 0.9319320 0.7628290
## Max.cor.Y.Time.Lag##rcv#glmnet 0.9279769 0.7473218
## Interact.High.cor.Y##rcv#glmnet 0.9315850 0.7641040
## Low.cor.X##rcv#glmnet 0.9276303 0.7453708
## All.X##rcv#glmnet 0.9261737 0.7273256
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB
## MFO###myMFO_classfr 0.5000000 1.0000000 0.0000000
## Random###myrandom_classfr 0.5125675 0.8077437 0.2173913
## Max.cor.Y.rcv.1X1###glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.6174697 0.9218959 0.3130435
## Max.cor.Y##rcv#rpart 0.5870523 0.9045394 0.2695652
## Max.cor.Y.Time.Poly##rcv#glmnet 0.5950717 0.9118825 0.2782609
## Max.cor.Y.Time.Lag##rcv#glmnet 0.5927265 0.9158879 0.2695652
## Interact.High.cor.Y##rcv#glmnet 0.6009259 0.9105474 0.2913043
## Low.cor.X##rcv#glmnet 0.5917252 0.9138852 0.2695652
## All.X##rcv#glmnet 0.5873513 0.9399199 0.2347826
## max.AUCROCR.OOB opt.prob.threshold.OOB
## MFO###myMFO_classfr 0.5000000 0.1
## Random###myrandom_classfr 0.4857956 0.1
## Max.cor.Y.rcv.1X1###glmnet 0.8116126 0.1
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.8067975 0.1
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.8067975 0.1
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.8067975 0.1
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.8114863 0.1
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.8114863 0.1
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.8114863 0.1
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.7773858 0.1
## Max.cor.Y##rcv#rpart 0.5892132 0.6
## Max.cor.Y.Time.Poly##rcv#glmnet 0.7997373 0.1
## Max.cor.Y.Time.Lag##rcv#glmnet 0.8024758 0.1
## Interact.High.cor.Y##rcv#glmnet 0.8140971 0.1
## Low.cor.X##rcv#glmnet 0.8052766 0.1
## All.X##rcv#glmnet 0.8158791 0.1
## max.f.score.OOB max.Accuracy.OOB
## MFO###myMFO_classfr 0.2349336 0.1331019
## Random###myrandom_classfr 0.2349336 0.1331019
## Max.cor.Y.rcv.1X1###glmnet 0.4405405 0.7604167
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.4375839 0.7575231
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.4375839 0.7575231
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.4375839 0.7575231
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.4609375 0.7604167
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.4609375 0.7604167
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.4609375 0.7604167
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.4207493 0.7673611
## Max.cor.Y##rcv#rpart 0.2850575 0.8200231
## Max.cor.Y.Time.Poly##rcv#glmnet 0.4191617 0.7754630
## Max.cor.Y.Time.Lag##rcv#glmnet 0.4118565 0.7818287
## Interact.High.cor.Y##rcv#glmnet 0.4398340 0.7656250
## Low.cor.X##rcv#glmnet 0.4152672 0.7783565
## All.X##rcv#glmnet 0.3886792 0.6250000
## max.Kappa.OOB inv.elapsedtime.everything
## MFO###myMFO_classfr 0.0000000 3.32225914
## Random###myrandom_classfr 0.0000000 3.27868852
## Max.cor.Y.rcv.1X1###glmnet 0.3148374 0.93720712
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.3107477 0.39016777
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.3107477 0.21114865
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.3107477 0.13829346
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.3373693 0.28876697
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.3373693 0.14751438
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.3373693 0.10787487
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.2953321 1.03199174
## Max.cor.Y##rcv#rpart 0.1825002 0.29231219
## Max.cor.Y.Time.Poly##rcv#glmnet 0.2963401 0.07932730
## Max.cor.Y.Time.Lag##rcv#glmnet 0.2908072 0.02525253
## Interact.High.cor.Y##rcv#glmnet 0.3156027 0.19716088
## Low.cor.X##rcv#glmnet 0.2931798 0.01303237
## All.X##rcv#glmnet 0.2276988 0.11943151
## inv.elapsedtime.final
## MFO###myMFO_classfr 333.3333333
## Random###myrandom_classfr 1000.0000000
## Max.cor.Y.rcv.1X1###glmnet 3.5971223
## Max.cor.Y.rcv.3X1##rcv#glmnet 3.6101083
## Max.cor.Y.rcv.3X3##rcv#glmnet 3.5971223
## Max.cor.Y.rcv.3X5##rcv#glmnet 3.5971223
## Max.cor.Y.rcv.5X1##rcv#glmnet 3.5714286
## Max.cor.Y.rcv.5X3##rcv#glmnet 3.5971223
## Max.cor.Y.rcv.5X5##rcv#glmnet 3.5714286
## Max.cor.Y.rcv.1X1.cp.0###rpart 13.1578947
## Max.cor.Y##rcv#rpart 13.5135135
## Max.cor.Y.Time.Poly##rcv#glmnet 0.5875441
## Max.cor.Y.Time.Lag##rcv#glmnet 0.3594536
## Interact.High.cor.Y##rcv#glmnet 3.1055901
## Low.cor.X##rcv#glmnet 0.2020202
## All.X##rcv#glmnet 1.5455951
# Print the plot returned by myplot_radar() for the per-model metrics in
# plt_models_df (presumably a radar chart, going by the helper's name).
print(myplot_radar(radar_inp_df=plt_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 16. Consider specifying shapes manually if you must have them.
## Warning: Removed 200 rows containing missing values (geom_point).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 16. Consider specifying shapes manually if you must have them.
# print(myplot_radar(radar_inp_df=subset(plt_models_df,
# !(mdl_id %in% grep("random|MFO", plt_models_df$id, value=TRUE)))))
# Compute CI for <metric>SD
# max.df: degrees of freedom taken as max.nTuningRuns - 1 (NA when there was at
# most one tuning run); min.sd2ci.scaler: the 97.5% t-quantile for that df,
# i.e. the multiplier that turns an SD into a two-sided 95% half-width.
glb_models_df <- mutate(glb_models_df,
max.df = ifelse(max.nTuningRuns > 1, max.nTuningRuns - 1, NA),
min.sd2ci.scaler = ifelse(is.na(max.df), NA, qt(0.975, max.df)))
# For every column whose name contains "SD", derive the matching
# "<prefix>Upper<suffix>" / "<prefix>Lower<suffix>" columns.
for (var in grep("SD", names(glb_models_df), value=TRUE)) {
# Does CI already exist ?
# Splitting on "SD" yields the text before and after it, e.g.
# "max.KappaSD.fit" -> "max.Kappa" and ".fit".
var_components <- unlist(strsplit(var, "SD"))
# Name of the point-estimate column (the SD name with "SD" removed)
varActul <- paste0(var_components[1], var_components[2])
varUpper <- paste0(var_components[1], "Upper", var_components[2])
varLower <- paste0(var_components[1], "Lower", var_components[2])
if (varUpper %in% names(glb_models_df)) {
warning(varUpper, " already exists in glb_models_df")
# Assuming Lower also exists
next
}
print(sprintf("var:%s", var))
# CI is dependent on sample size in t distribution; df=n-1
# Bounds are estimate +/- scaler * SD; rows with an NA scaler or NA SD get NA.
glb_models_df[, varUpper] <- glb_models_df[, varActul] +
glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, var]
glb_models_df[, varLower] <- glb_models_df[, varActul] -
glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, var]
}
## Warning: max.AccuracyUpper.fit already exists in glb_models_df
## [1] "var:max.KappaSD.fit"
# Plot metrics with CI
# Both frames start as a one-column data frame holding only "id"; the third
# positional argument FALSE is `drop`, which keeps the result a data frame.
plt_models_df <- glb_models_df[, "id", FALSE]
pltCI_models_df <- glb_models_df[, "id", FALSE]
# For each "...Upper..." column: copy the point estimate into plt_models_df and
# the Upper/Lower pair into pltCI_models_df.
for (var in grep("Upper", names(glb_models_df), value=TRUE)) {
var_components <- unlist(strsplit(var, "Upper"))
# Point-estimate column name = the Upper name with "Upper" removed
col_name <- unlist(paste(var_components, collapse=""))
plt_models_df[, col_name] <- glb_models_df[, col_name]
# The Lower column is assumed to exist alongside each Upper column
for (name in paste0(var_components[1], c("Upper", "Lower"), var_components[2]))
pltCI_models_df[, name] <- glb_models_df[, name]
}
# Reshape a wide per-model metrics frame into long form and split every metric
# column name into a statistic label and a data-set suffix.
#
# Args:
#   plt_models_df: data frame with an "id" column plus one column per metric,
#     named "<label>.<data>" (e.g. "max.Accuracy.fit").
#
# Returns:
#   The melt() result (id, variable, value) with two extra character columns:
#     data  - the text after the last "." of the metric name (e.g. "fit")
#     label - the text before that last "." (e.g. "max.Accuracy")
#   A metric name without any "." is used unchanged for both columns.
build_statsCI_data <- function(plt_models_df) {
  mltd_models_df <- melt(plt_models_df, id.vars = "id")
  metric_names <- as.character(mltd_models_df$variable)
  # Anchored, vectorised patterns: the suffix is removed only at the very end of
  # the name, so a label that merely contains the suffix text (for example
  # "max.Profit.fit") is not truncated, and a zero-row input is handled too.
  mltd_models_df$data <- sub("^.*[.]", "", metric_names)
  mltd_models_df$label <- sub("[.][^.]*$", "", metric_names)
  return(mltd_models_df)
}
# Long-form point estimates (id, variable, value, data, label)
mltd_models_df <- build_statsCI_data(plt_models_df)
# Long-form confidence bounds; each row is tagged below with the metric label,
# the data-set suffix and whether it is the "Upper" or "Lower" bound.
mltdCI_models_df <- melt(pltCI_models_df, id.vars="id")
# NOTE(review): 1:nrow() iterates over c(1, 0) when the frame has no rows.
for (row_ix in 1:nrow(mltdCI_models_df)) {
for (type in c("Upper", "Lower")) {
# A split that yields more than one piece means this variable name contains
# `type`; piece 1 is the label and piece 2 is ".<data>".
if (length(var_components <- unlist(strsplit(
as.character(mltdCI_models_df[row_ix, "variable"]), type))) > 1) {
#print(sprintf("row_ix:%d; type:%s; ", row_ix, type))
mltdCI_models_df[row_ix, "label"] <- var_components[1]
# ".<data>" split on "." gives c("", "<data>"), hence element 2
mltdCI_models_df[row_ix, "data"] <-
unlist(strsplit(var_components[2], "[.]"))[2]
mltdCI_models_df[row_ix, "type"] <- type
break
}
}
}
# One row per (id, label, data) with the bounds side by side as
# value.Upper / value.Lower.
wideCI_models_df <- reshape(subset(mltdCI_models_df, select=-variable),
timevar="type",
idvar=setdiff(names(mltdCI_models_df), c("type", "value", "variable")),
direction="wide")
#print(wideCI_models_df)
# Attach the point estimates to the bounds; merge() joins on the columns the
# two frames share, and all.x=TRUE keeps every CI row.
mrgdCI_models_df <- merge(wideCI_models_df, mltd_models_df, all.x=TRUE)
#print(mrgdCI_models_df)
# Merge stats back in if CIs don't exist
# Every label/data pairing is a candidate metric; those not yet present in
# mltd_models_df (no CI was available for them) are pulled from glb_models_df.
present_metrics <- unique(paste(mltd_models_df$label, mltd_models_df$data,
                                sep = "."))
# t() makes the flattened order "each label, then each data suffix"
candidate_metrics <- as.vector(t(outer(unique(mltd_models_df$label),
                                       unique(mltd_models_df$data),
                                       paste, sep = ".")))
goback_vars <- setdiff(candidate_metrics, present_metrics)
if (length(goback_vars) > 0) {
  mltd_goback_df <- build_statsCI_data(glb_models_df[, c("id", goback_vars)])
  mltd_models_df <- rbind(mltd_models_df, mltd_goback_df)
}
# mltd_models_df <- merge(mltd_models_df, glb_models_df[, c("id", "model_method")],
# all.x=TRUE)
# Render the per-model metric bar chart, with CI error bars, to
# "<glb_out_pfx>models_bar.png" (1440 x 960 px), then close the device.
png(paste0(glb_out_pfx, "models_bar.png"), width=480*3, height=480*2)
#print(gp <- myplot_bar(mltd_models_df, "id", "value", colorcol_name="model_method") +
# Bars come from the point estimates, error bars from mrgdCI_models_df; panels
# are laid out as metric label (rows) by data suffix (columns) with free scales.
# NOTE(review): aes(x=mdl_id) names a column that this chunk never creates in
# mrgdCI_models_df (its key is "id") - confirm where mdl_id is resolved.
print(gp <- myplot_bar(df=mltd_models_df, xcol_name="id", ycol_names="value") +
geom_errorbar(data=mrgdCI_models_df,
mapping=aes(x=mdl_id, ymax=value.Upper, ymin=value.Lower), width=0.5) +
facet_grid(label ~ data, scales="free") +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5)))
## Warning: Removed 4 rows containing missing values (geom_errorbar).
dev.off()
## quartz_off_screen
## 2
# Draw the same plot again on the current (default) device
print(gp)
## Warning: Removed 4 rows containing missing values (geom_errorbar).
# Columns to display: the model id, the evaluation metrics from
# glbMdlMetricsEval that exist in glb_models_df, and every column whose name
# contains the literal text "opt." (fixed = TRUE disables regex matching).
dsp_models_cols <- c("id",
glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)],
grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE))
# if (glb_is_classification && glb_is_binomial)
# dsp_models_cols <- c(dsp_models_cols, "opt.prob.threshold.OOB")
# Order the models by the model-selection formula and keep only the display
# columns; row 1 of dsp_models_df is later reported as the best model.
print(dsp_models_df <- orderBy(get_model_sel_frmla(), glb_models_df)[, dsp_models_cols])
## id
## Max.cor.Y##rcv#rpart Max.cor.Y##rcv#rpart
## Max.cor.Y.Time.Lag##rcv#glmnet Max.cor.Y.Time.Lag##rcv#glmnet
## Low.cor.X##rcv#glmnet Low.cor.X##rcv#glmnet
## Max.cor.Y.Time.Poly##rcv#glmnet Max.cor.Y.Time.Poly##rcv#glmnet
## Max.cor.Y.rcv.1X1.cp.0###rpart Max.cor.Y.rcv.1X1.cp.0###rpart
## Interact.High.cor.Y##rcv#glmnet Interact.High.cor.Y##rcv#glmnet
## Max.cor.Y.rcv.1X1###glmnet Max.cor.Y.rcv.1X1###glmnet
## Max.cor.Y.rcv.5X3##rcv#glmnet Max.cor.Y.rcv.5X3##rcv#glmnet
## Max.cor.Y.rcv.5X1##rcv#glmnet Max.cor.Y.rcv.5X1##rcv#glmnet
## Max.cor.Y.rcv.5X5##rcv#glmnet Max.cor.Y.rcv.5X5##rcv#glmnet
## Max.cor.Y.rcv.3X1##rcv#glmnet Max.cor.Y.rcv.3X1##rcv#glmnet
## Max.cor.Y.rcv.3X3##rcv#glmnet Max.cor.Y.rcv.3X3##rcv#glmnet
## Max.cor.Y.rcv.3X5##rcv#glmnet Max.cor.Y.rcv.3X5##rcv#glmnet
## All.X##rcv#glmnet All.X##rcv#glmnet
## MFO###myMFO_classfr MFO###myMFO_classfr
## Random###myrandom_classfr Random###myrandom_classfr
## max.Accuracy.OOB max.AUCROCR.OOB
## Max.cor.Y##rcv#rpart 0.8200231 0.5892132
## Max.cor.Y.Time.Lag##rcv#glmnet 0.7818287 0.8024758
## Low.cor.X##rcv#glmnet 0.7783565 0.8052766
## Max.cor.Y.Time.Poly##rcv#glmnet 0.7754630 0.7997373
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.7673611 0.7773858
## Interact.High.cor.Y##rcv#glmnet 0.7656250 0.8140971
## Max.cor.Y.rcv.1X1###glmnet 0.7604167 0.8116126
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.7604167 0.8114863
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.7604167 0.8114863
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.7604167 0.8114863
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.7575231 0.8067975
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.7575231 0.8067975
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.7575231 0.8067975
## All.X##rcv#glmnet 0.6250000 0.8158791
## MFO###myMFO_classfr 0.1331019 0.5000000
## Random###myrandom_classfr 0.1331019 0.4857956
## max.AUCpROC.OOB max.Accuracy.fit
## Max.cor.Y##rcv#rpart 0.5870523 0.9296422
## Max.cor.Y.Time.Lag##rcv#glmnet 0.5927265 0.9279769
## Low.cor.X##rcv#glmnet 0.5917252 0.9276303
## Max.cor.Y.Time.Poly##rcv#glmnet 0.5950717 0.9319320
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.6174697 0.9381765
## Interact.High.cor.Y##rcv#glmnet 0.6009259 0.9315850
## Max.cor.Y.rcv.1X1###glmnet 0.5962443 0.9329725
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5962443 0.9333905
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5962443 0.9331818
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5962443 0.9331816
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.5962443 0.9335973
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.5962443 0.9333193
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.5962443 0.9332218
## All.X##rcv#glmnet 0.5873513 0.9261737
## MFO###myMFO_classfr 0.5000000 0.1796420
## Random###myrandom_classfr 0.5125675 0.1796420
## opt.prob.threshold.fit
## Max.cor.Y##rcv#rpart 0.6
## Max.cor.Y.Time.Lag##rcv#glmnet 0.2
## Low.cor.X##rcv#glmnet 0.2
## Max.cor.Y.Time.Poly##rcv#glmnet 0.4
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.4
## Interact.High.cor.Y##rcv#glmnet 0.4
## Max.cor.Y.rcv.1X1###glmnet 0.5
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.4
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.4
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.4
## All.X##rcv#glmnet 0.3
## MFO###myMFO_classfr 0.1
## Random###myrandom_classfr 0.1
## opt.prob.threshold.OOB
## Max.cor.Y##rcv#rpart 0.6
## Max.cor.Y.Time.Lag##rcv#glmnet 0.1
## Low.cor.X##rcv#glmnet 0.1
## Max.cor.Y.Time.Poly##rcv#glmnet 0.1
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.1
## Interact.High.cor.Y##rcv#glmnet 0.1
## Max.cor.Y.rcv.1X1###glmnet 0.1
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.1
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.1
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.1
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.1
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.1
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.1
## All.X##rcv#glmnet 0.1
## MFO###myMFO_classfr 0.1
## Random###myrandom_classfr 0.1
# Print the myplot_radar() plot for the ordered display metrics
print(myplot_radar(radar_inp_df = dsp_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 16. Consider specifying shapes manually if you must have them.
## Warning: Removed 70 rows containing missing values (geom_point).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 16. Consider specifying shapes manually if you must have them.
# Show the formula used to order the models
print("Metrics used for model selection:"); print(get_model_sel_frmla())
## [1] "Metrics used for model selection:"
## ~-max.Accuracy.OOB - max.AUCROCR.OOB - max.AUCpROC.OOB - max.Accuracy.fit -
## opt.prob.threshold.OOB
## <environment: 0x7fa2ea47a920>
# The first row of the ordered frame is reported as the best model
print(sprintf("Best model id: %s", dsp_models_df[1, "id"]))
## [1] "Best model id: Max.cor.Y##rcv#rpart"
# Add the predictions of model `mdl_id`, and their error measures, to `df`.
#
# Args:
#   df: data frame of observations; passed to predict() as `newdata` and
#     expected to hold the response column named by the global glb_rsp_var.
#   mdl_id: model identifier; indexes glb_models_lst and is matched against
#     glb_models_df$id.
#   rsp_var: response name handed to mygetPredictIds() to derive the names of
#     the prediction columns.
#   prob_threshold_def: fallback probability threshold (binomial only), used
#     when glb_models_df holds no usable opt.prob.threshold.OOB for mdl_id.
#   verbose: when TRUE, print diagnostic plots / the largest-error rows.
#
# Returns:
#   `df` with the prediction columns named by mygetPredictIds(): value, err,
#   err.abs, is.acc, plus prob in the binomial case.
#
# Raises:
#   stop() when no threshold is recorded and prob_threshold_def is NULL, and
#   for multinomial classification (error calculation not implemented).
#
# NOTE(review): the column names are derived from `rsp_var` but the actual
# responses are always read from the global glb_rsp_var - confirm that callers
# never pass a different rsp_var.
glb_get_predictions <- function(df, mdl_id, rsp_var, prob_threshold_def=NULL, verbose=FALSE) {
  mdl <- glb_models_lst[[mdl_id]]

  clmnNames <- mygetPredictIds(rsp_var, mdl_id)
  predct_var_name <- clmnNames$value
  predct_prob_var_name <- clmnNames$prob
  predct_accurate_var_name <- clmnNames$is.acc
  predct_error_var_name <- clmnNames$err
  predct_erabs_var_name <- clmnNames$err.abs

  if (glb_is_regression) {
    df[, predct_var_name] <- predict(mdl, newdata=df, type="raw")
    if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_var_name) +
                         facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
                         stat_smooth(method="glm"))

    # Signed error: prediction minus actual
    df[, predct_error_var_name] <- df[, predct_var_name] - df[, glb_rsp_var]
    if (verbose) print(myplot_scatter(df, predct_var_name, predct_error_var_name) +
                         stat_smooth(method="auto"))
    if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_error_var_name) +
                         stat_smooth(method="glm"))

    df[, predct_erabs_var_name] <- abs(df[, predct_error_var_name])
    if (verbose) print(head(orderBy(reformulate(c("-", predct_erabs_var_name)), df)))

    df[, predct_accurate_var_name] <- (df[, glb_rsp_var] == df[, predct_var_name])
  }

  if (glb_is_classification && glb_is_binomial) {
    prob_threshold <- glb_models_df[glb_models_df$id == mdl_id,
                                    "opt.prob.threshold.OOB"]
    # An unknown mdl_id yields a zero-length lookup; test the length first so
    # that `||` never receives a zero-length operand.
    if (length(prob_threshold) == 0 || is.na(prob_threshold[1])) {
      if (is.null(prob_threshold_def))
        stop("Default probability threshold is NULL")
      warning("Using default probability threshold: ", prob_threshold_def)
      prob_threshold <- prob_threshold_def
    }

    rsp_levels <- levels(df[, glb_rsp_var])
    # Column 2 of the probability matrix is treated as the probability of the
    # second response level.
    df[, predct_prob_var_name] <- predict(mdl, newdata = df, type = "prob")[, 2]
    # Probability >= threshold -> second level, otherwise first level
    df[, predct_var_name] <-
      factor(rsp_levels[(df[, predct_prob_var_name] >= prob_threshold) * 1 + 1],
             rsp_levels)

    df[, predct_error_var_name] <- df[, predct_var_name] != df[, glb_rsp_var]

    # Absolute error = distance of the predicted probability from the actual
    # outcome (1 when the response is the second level, 0 when it is the
    # first). This covers the TP / FN cases (distance from 1.0) and the
    # TN / FP cases (distance from 0.0) in one step; rows with a missing
    # response or probability get NA.
    df[, predct_erabs_var_name] <-
      abs((df[, glb_rsp_var] == rsp_levels[2]) - df[, predct_prob_var_name])

    if (verbose) print(head(orderBy(reformulate(c("-", predct_erabs_var_name)), df)))

    df[, predct_accurate_var_name] <- (df[, glb_rsp_var] == df[, predct_var_name])
  }

  if (glb_is_classification && !glb_is_binomial) {
    df[, predct_var_name] <- predict(mdl, newdata = df, type = "raw")
    df[, paste0(predct_var_name, ".prob")] <-
      predict(mdl, newdata = df, type = "prob")
    stop("Multinomial prediction error calculation needs to be implemented...")
  }

  return(df)
}
#stop(here"); glb2Sav(); glbObsAll <- savObsAll; glbObsTrn <- savObsTrn; glbObsFit <- savObsFit; glbObsOOB <- savObsOOB; sav_models_df <- glb_models_df; glb_models_df <- sav_models_df; glb_featsimp_df <- sav_featsimp_df
# Summarise a model's absolute prediction error per category level.
#
# Args:
#   obs_df: observations; if the model's prediction column is missing it is
#           added via glb_get_predictions().
#   mdl_id: id of the model whose prediction / err.abs columns are used.
#   label:  suffix used in the output column names (e.g. "fit", "OOB").
#
# Returns an ungrouped table with one row per level of glbFeatsCategory and
# the columns err.abs.<label>.sum, err.abs.<label>.mean and .n.<label>.
#
# Reads globals: glb_rsp_var, glbFeatsCategory.
myget_category_stats <- function(obs_df, mdl_id, label) {
    require(dplyr)
    require(lazyeval)

    valueCol  <- mygetPredictIds(glb_rsp_var, mdl_id)$value
    errAbsCol <- mygetPredictIds(glb_rsp_var, mdl_id)$err.abs

    # Score the observations first when predictions are not there yet
    if (!(valueCol %in% names(obs_df))) {
        obs_df <- glb_get_predictions(obs_df, mdl_id, glb_rsp_var)
    }

    # Keep only what the summary needs; the error column is last and is
    # renamed to a label-specific name
    errLabel  <- paste0("err.abs.", label)
    sbstObsDf <- obs_df[, c(glbFeatsCategory, glb_rsp_var,
                            valueCol, errAbsCol)]
    names(sbstObsDf)[ncol(sbstObsDf)] <- errLabel

    # Per-category sum, mean and count of the absolute error
    errSym   <- as.name(errLabel)
    grpObsDf <- dplyr::group_by_(sbstObsDf, glbFeatsCategory)
    ctgryDf  <- dplyr::summarise_(grpObsDf,
                                  interp(~sum(var), var = errSym),
                                  interp(~mean(var), var = errSym),
                                  interp(~n()))
    names(ctgryDf) <- c(glbFeatsCategory,
                        paste0("err.abs.", label, ".sum"),
                        paste0("err.abs.", label, ".mean"),
                        paste0(".n.", label))

    dplyr::ungroup(ctgryDf)
}
#print(colSums((ctgry_df <- myget_category_stats(obs_df=glbObsFit, mdl_id="", label="fit"))[, -grep(glbFeatsCategory, names(ctgry_df))]))
# Fit ensemble models that use the member models' predictions as features.
#
# glb_mdl_ensemble may be:
#   * "auto"           - members chosen by mygetEnsembleAutoMdlIds() below
#   * "%<d-% <R expr>" - a single string whose expression yields the member ids
#   * a character vector of member model ids
# Members missing from glb_models_lst are skipped with a warning. If no member
# prediction column ends up in glbObsFit the ensemble fit is skipped with a
# warning instead of calling myfit_mdl() with no features.
# NOTE(review): mdl_id_pfx is read in the myadd_chunk() label before it is set
#   to "Ensemble" on the next statement, so the label uses whatever value an
#   earlier chunk left behind - confirm this is intended.
if (!is.null(glb_mdl_ensemble)) {
    fit.models_2_chunk_df <- myadd_chunk(fit.models_2_chunk_df,
        paste0("fit.models_2_", mdl_id_pfx), major.inc = TRUE,
        label.minor = "ensemble")

    mdl_id_pfx <- "Ensemble"

    # Scalar flags: short-circuit && instead of elementwise &
    if (#(glb_is_regression) ||
        glb_is_classification && !glb_is_binomial)
        stop("Ensemble models not implemented yet for multinomial classification")

    # Ids of the models ordered (by the model-selection formula) ahead of the
    # first MFO / Random / Baseline model, excluding any Ensemble models.
    # Returns character(0) when a baseline model is ordered first; when no
    # baseline model exists, every non-Ensemble model is returned.
    mygetEnsembleAutoMdlIds <- function() {
        tmp_models_df <- orderBy(get_model_sel_frmla(), glb_models_df)
        row.names(tmp_models_df) <- tmp_models_df$id
        baseline_pos <- grep("MFO|Random|Baseline", tmp_models_df$id)
        # head() instead of 1:n - 1:0 would wrongly select the first row
        n_better <- if (length(baseline_pos) > 0)
            min(baseline_pos) - 1 else nrow(tmp_models_df)
        mdlIds <- head(tmp_models_df$id, n_better)
        return(mdlIds[!grepl("Ensemble", mdlIds)])
    }

    # Only a single string can be a keyword / expression spec; a vector of
    # model ids must not be used as an if() condition
    is_single_spec <- (length(glb_mdl_ensemble) == 1)
    if (is_single_spec && (glb_mdl_ensemble == "auto")) {
        glb_mdl_ensemble <- mygetEnsembleAutoMdlIds()
        mdl_id_pfx <- paste0(mdl_id_pfx, ".auto")
    } else if (is_single_spec && grepl("^%<d-%", glb_mdl_ensemble)) {
        # NOTE: evaluates R code taken from the glb_mdl_ensemble setting;
        #   only use with trusted configuration
        glb_mdl_ensemble <- eval(parse(text =
            str_trim(unlist(strsplit(glb_mdl_ensemble, "%<d-%"))[2])))
    }

    # Add each member's predictions to the Fit & OOB observations
    for (mdl_id in glb_mdl_ensemble) {
        if (!(mdl_id %in% names(glb_models_lst))) {
            warning("Model ", mdl_id, " in glb_model_ensemble not found !")
            next
        }
        glbObsFit <- glb_get_predictions(df = glbObsFit, mdl_id, glb_rsp_var)
        glbObsOOB <- glb_get_predictions(df = glbObsOOB, mdl_id, glb_rsp_var)
    }

    #mdl_id_pfx <- "Ensemble.RFE"; mdlId <- paste0(mdl_id_pfx, ".glmnet")
    #glb_mdl_ensemble <- gsub(mygetPredictIds$value, "", grep("RFE\\.X\\.(?!Interact)", row.names(glb_featsimp_df), perl = TRUE, value = TRUE), fixed = TRUE)
    #varImp(glb_models_lst[[mdlId]])
    #cor_df <- data.frame(cor=cor(glbObsFit[, glb_rsp_var], glbObsFit[, paste(mygetPredictIds$value, glb_mdl_ensemble)], use="pairwise.complete.obs"))
    #glbObsFit <- glb_get_predictions(df=glbObsFit, "Ensemble.glmnet", glb_rsp_var);print(colSums((ctgry_df <- myget_category_stats(obs_df=glbObsFit, mdl_id="Ensemble.glmnet", label="fit"))[, -grep(glbFeatsCategory, names(ctgry_df))]))
    ### bid0_sp
    # Better than MFO; models.n=28; min.RMSE.fit=0.0521233; err.abs.fit.sum=7.3631895
    # old: Top x from auto; models.n= 5; min.RMSE.fit=0.06311047; err.abs.fit.sum=9.5937080
    # RFE only ; models.n=16; min.RMSE.fit=0.05148588; err.abs.fit.sum=7.2875091
    # RFE subset only ;models.n= 5; min.RMSE.fit=0.06040702; err.abs.fit.sum=9.059088
    # RFE subset only ;models.n= 9; min.RMSE.fit=0.05933167; err.abs.fit.sum=8.7421288
    # RFE subset only ;models.n=15; min.RMSE.fit=0.0584607; err.abs.fit.sum=8.5902066
    # RFE subset only ;models.n=17; min.RMSE.fit=0.05496899; err.abs.fit.sum=8.0170431
    # RFE subset only ;models.n=18; min.RMSE.fit=0.05441577; err.abs.fit.sum=7.837223
    # RFE subset only ;models.n=16; min.RMSE.fit=0.05441577; err.abs.fit.sum=7.837223
    ### bid0_sp
    ### bid1_sp
    # "auto"; err.abs.fit.sum=76.699774; min.RMSE.fit=0.2186429
    # "RFE.X.*"; err.abs.fit.sum=; min.RMSE.fit=0.221114
    ### bid1_sp

    # Ensemble features = member prediction columns (probabilities for
    # classification). paste0() on an empty id vector would fabricate a
    # prefix-only name, so the empty case is handled explicitly.
    if (length(glb_mdl_ensemble) > 0) {
        indep_vars <- paste0(mygetPredictIds(glb_rsp_var)$value, glb_mdl_ensemble)
        if (glb_is_classification)
            indep_vars <- paste0(indep_vars, ".prob")
    } else {
        indep_vars <- character(0)
    }

    # Some models in glb_mdl_ensemble might not be fitted e.g. RFE.X.Interact
    indep_vars <- intersect(indep_vars, names(glbObsFit))

    #     indep_vars <- grep(mygetPredictIds(glb_rsp_var)$value, names(glbObsFit), fixed=TRUE, value=TRUE)
    #     if (glb_is_regression)
    #         indep_vars <- indep_vars[!grepl("(err\\.abs|accurate)$", indep_vars)]
    #     if (glb_is_classification && glb_is_binomial)
    #         indep_vars <- grep("prob$", indep_vars, value=TRUE) else
    #         indep_vars <- indep_vars[!grepl("err$", indep_vars)]

    #rfe_fit_ens_results <- myrun_rfe(glbObsFit, indep_vars)

    if (length(indep_vars) == 0) {
        warning("No member model predictions available for the ensemble; ",
                "skipping ensemble fit")
    } else {
        for (method in c("glm", "glmnet")) {
            for (trainControlMethod in
                 c("boot", "boot632", "cv", "repeatedcv"
                   #, "LOOCV" # tuneLength * nrow(fitDF)
                   , "LGOCV", "adaptive_cv"
                   #, "adaptive_boot"  #error: adaptive$min should be less than 3
                   #, "adaptive_LGOCV" #error: adaptive$min should be less than 3
                   )) {
                #sav_models_df <- glb_models_df; all.equal(sav_models_df, glb_models_df)
                #glb_models_df <- sav_models_df; print(glb_models_df$id)

                if ((method == "glm") && (trainControlMethod != "repeatedcv"))
                    # glm used only to identify outliers
                    next

                ret_lst <- myfit_mdl(
                    mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
                        id.prefix = paste0(mdl_id_pfx, ".", trainControlMethod),
                        type = glb_model_type, tune.df = NULL,
                        trainControl.method = trainControlMethod,
                        trainControl.number = glb_rcv_n_folds,
                        trainControl.repeats = glb_rcv_n_repeats,
                        trainControl.classProbs = glb_is_classification,
                        trainControl.summaryFunction = glbMdlMetricSummaryFn,
                        train.metric = glbMdlMetricSummary,
                        train.maximize = glbMdlMetricMaximize,
                        train.method = method)),
                    indep_vars = indep_vars, rsp_var = glb_rsp_var,
                    fit_df = glbObsFit, OOB_df = glbObsOOB)
            }
        }
    }

    dsp_models_df <- get_dsp_models_df()
}
# Honour a user-specified model selection; otherwise default to the id in the
# first row of dsp_models_df.
if (!is.null(glb_sel_mdl_id)) {
    print(sprintf("User specified selection: %s", glb_sel_mdl_id))
} else {
    glb_sel_mdl_id <- dsp_models_df[1, "id"]
}
## [1] "User specified selection: All.X##rcv#glmnet"
# Look up the selected model in glb_models_lst, keep it in glb_sel_mdl (the
# assignment happens inside the call) and print it with myprint_mdl().
myprint_mdl(glb_sel_mdl <- glb_models_lst[[glb_sel_mdl_id]])
## Length Class Mode
## a0 100 -none- numeric
## beta 5700 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 57 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.70850276
## NDSSName.my.fctr#Multimedia#
## -0.03054293
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.60964877
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 1.94929790
## NDSSName.my.fctr#U.S.#Education
## -0.26727710
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.16615102
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.12950830
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 2.23033621
## NDSSName.my.fctrCulture#Arts#
## -0.17355619
## NDSSName.my.fctrForeign#World#
## -0.14144794
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.30359599
## NDSSName.my.fctrOpEd#Opinion#
## 2.48784290
## NDSSName.my.fctrScience#Health#
## 1.99084311
## NDSSName.my.fctrStyles##Fashion
## -0.25265329
## NDSSName.my.fctrStyles#U.S.#
## 1.82382955
## NDSSName.my.fctrTStyle##
## -0.42107301
## NDSSName.my.fctrTravel#Travel#
## -0.11335741
## PubDate.day.minutes.poly.1
## 9.80483966
## PubDate.day.minutes.poly.2
## 1.82278821
## PubDate.day.minutes.poly.4
## 4.04394519
## PubDate.hour.fctr(15.3,23]
## 0.04018079
## PubDate.last2.log1p
## 0.01063398
## PubDate.last4.log1p
## 0.01736546
## PubDate.wkend
## 0.15331409
## WordCount.log1p
## 0.14957685
## WordCount.root2
## 0.02369866
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.836566257
## NDSSName.my.fctr#Multimedia#
## -0.057640450
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.690659501
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 2.057495643
## NDSSName.my.fctr#U.S.#Education
## -0.297208511
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.177778863
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.158140199
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 2.307234767
## NDSSName.my.fctrCulture#Arts#
## -0.187805623
## NDSSName.my.fctrForeign#World#
## -0.169688427
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.341074056
## NDSSName.my.fctrOpEd#Opinion#
## 2.575223003
## NDSSName.my.fctrScience#Health#
## 2.067085985
## NDSSName.my.fctrStyles##Fashion
## -0.291552509
## NDSSName.my.fctrStyles#U.S.#
## 1.899896438
## NDSSName.my.fctrTStyle##
## -0.447864596
## NDSSName.my.fctrTravel#Travel#
## -0.145852260
## NDSSName.my.fctrmyOther
## -0.020871349
## PubDate.day.minutes.poly.1
## 10.208863233
## PubDate.day.minutes.poly.2
## 2.133565453
## PubDate.day.minutes.poly.4
## 4.405958526
## PubDate.hour.fctr(15.3,23]
## 0.042371837
## PubDate.last2.log1p
## 0.012177521
## PubDate.last4.log1p
## 0.019001562
## PubDate.last8.log1p
## 0.001906538
## PubDate.wkend
## 0.162732276
## WordCount.log1p
## 0.156162477
## WordCount.root2
## 0.024707718
## [1] TRUE
# From here to save(), this should all be in one function
# these are executed in the same seq twice more:
# fit.data.training & predict.data.new chunks
# Add the selected model's prediction columns to the Fit observations.
print(sprintf("%s fit prediction diagnostics:", glb_sel_mdl_id))
## [1] "All.X##rcv#glmnet fit prediction diagnostics:"
glbObsFit <- glb_get_predictions(df = glbObsFit, mdl_id = glb_sel_mdl_id,
rsp_var = glb_rsp_var)
# Same for the OOB observations.
print(sprintf("%s OOB prediction diagnostics:", glb_sel_mdl_id))
## [1] "All.X##rcv#glmnet OOB prediction diagnostics:"
glbObsOOB <- glb_get_predictions(df = glbObsOOB, mdl_id = glb_sel_mdl_id,
rsp_var = glb_rsp_var)
# Feature importance of the selected model, started from an empty table
# (featsimp_df=NULL); the imp column is also copied into a column named
# "<glb_sel_mdl_id>.imp".
glb_featsimp_df <-
myget_feats_importance(mdl=glb_sel_mdl, featsimp_df=NULL)
glb_featsimp_df[, paste0(glb_sel_mdl_id, ".imp")] <- glb_featsimp_df$imp
#mdl_id <-"RFE.X.glmnet"; glb_featsimp_df <- myget_feats_importance(glb_models_lst[[mdl_id]], glb_featsimp_df); glb_featsimp_df[, paste0(mdl_id, ".imp")] <- glb_featsimp_df$imp; print(glb_featsimp_df)
#print(head(sbst_featsimp_df <- subset(glb_featsimp_df, is.na(RFE.X.glmnet.imp) | (abs(RFE.X.YeoJohnson.glmnet.imp - RFE.X.glmnet.imp) > 0.0001), select=-imp)))
#print(orderBy(~ -cor.y.abs, subset(glb_feats_df, id %in% c(row.names(sbst_featsimp_df), "startprice.dcm1.is9", "D.weight.post.stop.sum"))))
print(glb_featsimp_df)
## imp
## PubDate.day.minutes.poly.1 100.000000
## PubDate.day.minutes.poly.4 46.372779
## NDSSName.my.fctrOpEd#Opinion# 29.922243
## NDSSName.my.fctrBusiness#Crosswords/Games# 27.460942
## PubDate.day.minutes.poly.2 25.434881
## NDSSName.my.fctrScience#Health# 25.239681
## NDSSName.my.fctr#Opinion#ThePublicEditor 25.093700
## NDSSName.my.fctrStyles#U.S.# 23.692755
## PubDate.wkend 7.735688
## WordCount.log1p 7.679961
## PubDate.hour.fctr(15.3,23] 6.634766
## WordCount.root2 6.473412
## PubDate.last4.log1p 6.419482
## PubDate.last2.log1p 6.356495
## PubDate.last8.log1p 6.260793
## .rnorm 6.246564
## NDSSName.my.fctrBusiness#Technology# 6.246564
## NDSSName.my.fctrCulture## 6.246564
## NDSSName.my.fctrMetro#N.Y./Region# 6.246564
## PubDate.date.fctr(7,13] 6.246564
## PubDate.date.fctr(13,19] 6.246564
## PubDate.date.fctr(19,25] 6.246564
## PubDate.date.fctr(25,31] 6.246564
## PubDate.day.minutes.poly.3 6.246564
## PubDate.day.minutes.poly.5 6.246564
## PubDate.hour.fctr(7.67,15.3] 6.246564
## PubDate.juliandate 6.246564
## PubDate.last16.log1p 6.246564
## PubDate.last32.log1p 6.246564
## PubDate.minute.fctr(14.8,29.5] 6.246564
## PubDate.minute.fctr(29.5,44.2] 6.246564
## PubDate.minute.fctr(44.2,59.1] 6.246564
## PubDate.month.fctr10 6.246564
## PubDate.month.fctr11 6.246564
## PubDate.month.fctr12 6.246564
## PubDate.second.fctr(14.8,29.5] 6.246564
## PubDate.second.fctr(29.5,44.2] 6.246564
## PubDate.second.fctr(44.2,59.1] 6.246564
## PubDate.wkday.fctr1 6.246564
## PubDate.wkday.fctr2 6.246564
## PubDate.wkday.fctr3 6.246564
## PubDate.wkday.fctr4 6.246564
## PubDate.wkday.fctr5 6.246564
## PubDate.wkday.fctr6 6.246564
## WordCount.nexp 6.246564
## NDSSName.my.fctrmyOther 6.090790
## NDSSName.my.fctr#Multimedia# 5.761664
## NDSSName.my.fctrTravel#Travel# 4.954982
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 4.834346
## NDSSName.my.fctrForeign#World# 4.726773
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 4.622150
## NDSSName.my.fctrCulture#Arts# 4.534053
## NDSSName.my.fctrStyles##Fashion 3.618082
## NDSSName.my.fctr#U.S.#Education 3.549679
## NDSSName.my.fctrForeign#World#AsiaPacific 3.157244
## NDSSName.my.fctrTStyle## 2.149822
## NDSSName.my.fctr#Opinion#RoomForDebate 0.000000
## All.X##rcv#glmnet.imp
## PubDate.day.minutes.poly.1 100.000000
## PubDate.day.minutes.poly.4 46.372779
## NDSSName.my.fctrOpEd#Opinion# 29.922243
## NDSSName.my.fctrBusiness#Crosswords/Games# 27.460942
## PubDate.day.minutes.poly.2 25.434881
## NDSSName.my.fctrScience#Health# 25.239681
## NDSSName.my.fctr#Opinion#ThePublicEditor 25.093700
## NDSSName.my.fctrStyles#U.S.# 23.692755
## PubDate.wkend 7.735688
## WordCount.log1p 7.679961
## PubDate.hour.fctr(15.3,23] 6.634766
## WordCount.root2 6.473412
## PubDate.last4.log1p 6.419482
## PubDate.last2.log1p 6.356495
## PubDate.last8.log1p 6.260793
## .rnorm 6.246564
## NDSSName.my.fctrBusiness#Technology# 6.246564
## NDSSName.my.fctrCulture## 6.246564
## NDSSName.my.fctrMetro#N.Y./Region# 6.246564
## PubDate.date.fctr(7,13] 6.246564
## PubDate.date.fctr(13,19] 6.246564
## PubDate.date.fctr(19,25] 6.246564
## PubDate.date.fctr(25,31] 6.246564
## PubDate.day.minutes.poly.3 6.246564
## PubDate.day.minutes.poly.5 6.246564
## PubDate.hour.fctr(7.67,15.3] 6.246564
## PubDate.juliandate 6.246564
## PubDate.last16.log1p 6.246564
## PubDate.last32.log1p 6.246564
## PubDate.minute.fctr(14.8,29.5] 6.246564
## PubDate.minute.fctr(29.5,44.2] 6.246564
## PubDate.minute.fctr(44.2,59.1] 6.246564
## PubDate.month.fctr10 6.246564
## PubDate.month.fctr11 6.246564
## PubDate.month.fctr12 6.246564
## PubDate.second.fctr(14.8,29.5] 6.246564
## PubDate.second.fctr(29.5,44.2] 6.246564
## PubDate.second.fctr(44.2,59.1] 6.246564
## PubDate.wkday.fctr1 6.246564
## PubDate.wkday.fctr2 6.246564
## PubDate.wkday.fctr3 6.246564
## PubDate.wkday.fctr4 6.246564
## PubDate.wkday.fctr5 6.246564
## PubDate.wkday.fctr6 6.246564
## WordCount.nexp 6.246564
## NDSSName.my.fctrmyOther 6.090790
## NDSSName.my.fctr#Multimedia# 5.761664
## NDSSName.my.fctrTravel#Travel# 4.954982
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 4.834346
## NDSSName.my.fctrForeign#World# 4.726773
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 4.622150
## NDSSName.my.fctrCulture#Arts# 4.534053
## NDSSName.my.fctrStyles##Fashion 3.618082
## NDSSName.my.fctr#U.S.#Education 3.549679
## NDSSName.my.fctrForeign#World#AsiaPacific 3.157244
## NDSSName.my.fctrTStyle## 2.149822
## NDSSName.my.fctr#Opinion#RoomForDebate 0.000000
# Used again in fit.data.training & predict.data.new chunks
# Print diagnostic plots for one model's predictions on obs_df.
#
# Args:
#   obs_df:         observations holding the response column (glb_rsp_var) and
#                   the model's prediction column (the scatter plots melt on
#                   both).
#   mdl_id:         model id used to derive the prediction column name.
#   prob_threshold: passed through to myplot_prediction_classification().
#
# When glb_featsimp_df is not NULL, a response-vs-prediction scatter plot is
# printed for each of (at most) the 5 most important base features, followed
# by the regression / classification prediction plot.
#
# Reads globals: glb_featsimp_df, glb_rsp_var, glb_id_var, glb_is_regression,
#   glb_is_classification.
glb_analytics_diag_plots <- function(obs_df, mdl_id, prob_threshold=NULL) {
    featsimp_df <- glb_featsimp_df

    if (!is.null(featsimp_df)) {
        # Row names are model terms: drop backticks, then split a term
        # "a:b" into its main part (a) and interaction part (b)
        term_names    <- gsub("`(.*?)`", "\\1", row.names(featsimp_df))
        main_term     <- gsub("(.*?):(.*)", "\\1", term_names)
        interact_term <- gsub("(.*?):(.*)", "\\2", term_names)
        # Terms without ":" come back unchanged from both substitutions
        interact_term <- ifelse(interact_term == main_term, NA, interact_term)

        # Collapse factor-level terms ("x.fctr<level>") to the factor itself
        featsimp_df$feat <-
            gsub("(.*?)\\.fctr(.*)", "\\1\\.fctr", main_term)
        featsimp_df$feat.interact <-
            gsub("(.*?)\\.fctr(.*)", "\\1\\.fctr", interact_term)

        # One row per feature (pair) carrying its maximum importance,
        # most important first
        max_imp_df <- summaryBy(imp ~ feat + feat.interact, data=featsimp_df,
                                FUN=max)
        featsimp_df <- orderBy(~ -imp.max, max_imp_df)
#rex_str=":(.*)"; txt_vctr=tail(featsimp_df$feat); ret_lst <- regexec(rex_str, txt_vctr); ret_lst <- regmatches(txt_vctr, ret_lst); ret_vctr <- sapply(1:length(ret_lst), function(pos_ix) ifelse(length(ret_lst[[pos_ix]]) > 0, ret_lst[[pos_ix]], "")); print(ret_vctr <- ret_vctr[ret_vctr != ""])
        featsimp_df <- featsimp_df[!is.na(featsimp_df$imp.max), , drop = FALSE]

        if (nrow(featsimp_df) > 5) {
            warning("Limiting important feature scatter plots to 5 out of ",
                    nrow(featsimp_df))
            featsimp_df <- head(featsimp_df, 5)
        }

#       if (!all(is.na(featsimp_df$feat.interact)))
#           stop("not implemented yet")
        rsp_var_out <- mygetPredictIds(glb_rsp_var, mdl_id)$value

        # Actual vs. predicted response against each retained feature
        for (feat_name in featsimp_df$feat) {
            long_df <- melt(obs_df, id.vars = feat_name,
                            measure.vars = c(glb_rsp_var, rsp_var_out))
            feat_plot <- myplot_scatter(long_df, feat_name, "value",
                                        colorcol_name = "variable",
                                        facet_colcol_name = "variable",
                                        jitter = TRUE) +
                guides(color = FALSE)
            print(feat_plot)
        }
    }

    # Axes for the prediction plots: top feature on y; second feature (or the
    # row names when only one feature is left) on x
    has_feats <- !is.null(featsimp_df) && (nrow(featsimp_df) > 0)
    if (has_feats) {
        feat_y <- featsimp_df$feat[1]
        feat_x <- if (nrow(featsimp_df) > 1) featsimp_df$feat[2] else
            ".rownames"
    }

    if (glb_is_regression) {
        if (!has_feats) {
            warning("No important features in glb_fin_mdl")
        } else {
            print(myplot_prediction_regression(df=obs_df,
                        feat_x=feat_x,
                        feat_y=feat_y,
                        rsp_var=glb_rsp_var, rsp_var_out=rsp_var_out,
                        id_vars=glb_id_var)
    #               + facet_wrap(reformulate(featsimp_df$feat[2])) # if [1 or 2] is a factor
    #               + geom_point(aes_string(color="<col_name>.fctr")) #  to color the plot
                  )
        }
    }

    if (glb_is_classification) {
        if (!has_feats) {
            warning("No features in selected model are statistically important")
        } else {
            print(myplot_prediction_classification(df = obs_df,
                        feat_x = feat_x,
                        feat_y = feat_y,
                        rsp_var = glb_rsp_var,
                        rsp_var_out = rsp_var_out,
                        id_vars = glb_id_var,
                        prob_threshold = prob_threshold))
        }
    }
}
# Diagnostic plots for the selected model on the OOB observations; only the
# binomial classification case supplies the model's OOB probability threshold.
if (!glb_is_classification || !glb_is_binomial) {
    glb_analytics_diag_plots(obs_df = glbObsOOB, mdl_id = glb_sel_mdl_id)
} else {
    glb_analytics_diag_plots(obs_df = glbObsOOB, mdl_id = glb_sel_mdl_id,
        prob_threshold = glb_models_df[glb_models_df$id == glb_sel_mdl_id,
                                       "opt.prob.threshold.OOB"])
}
## Warning in glb_analytics_diag_plots(obs_df = glbObsOOB, mdl_id =
## glb_sel_mdl_id, : Limiting important feature scatter plots to 5 out of 23
## [1] "Min/Max Boundaries: "
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 1 3918 N 0.03894856
## 2 2555 N 0.02747149
## 3 302 N 0.24157107
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 1 N FALSE
## 2 N FALSE
## 3 Y TRUE
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 1 0.03894856
## 2 0.02747149
## 3 0.24157107
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 1 TRUE
## 2 TRUE
## 3 FALSE
## Popular.fctr.All.X..rcv.glmnet.accurate
## 1 TRUE
## 2 TRUE
## 3 FALSE
## Popular.fctr.All.X..rcv.glmnet.error .label
## 1 0.0000000 3918
## 2 0.0000000 2555
## 3 0.1415711 302
## [1] "Inaccurate: "
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 1 172 Y 0.06307988
## 2 3554 Y 0.06508768
## 3 92 Y 0.06844409
## 4 3076 Y 0.07023428
## 5 6354 Y 0.07072164
## 6 4020 Y 0.07126866
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 1 N TRUE
## 2 N TRUE
## 3 N TRUE
## 4 N TRUE
## 5 N TRUE
## 6 N TRUE
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 1 0.9369201
## 2 0.9349123
## 3 0.9315559
## 4 0.9297657
## 5 0.9292784
## 6 0.9287313
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Popular.fctr.All.X..rcv.glmnet.accurate
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Popular.fctr.All.X..rcv.glmnet.error
## 1 -0.03692012
## 2 -0.03491232
## 3 -0.03155591
## 4 -0.02976572
## 5 -0.02927836
## 6 -0.02873134
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 157 5877 N 0.1121261
## 333 2905 N 0.1357577
## 363 4501 N 0.1409147
## 397 3109 N 0.1510194
## 401 3763 N 0.1517372
## 637 483 N 0.7438254
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 157 Y TRUE
## 333 Y TRUE
## 363 Y TRUE
## 397 Y TRUE
## 401 Y TRUE
## 637 Y TRUE
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 157 0.1121261
## 333 0.1357577
## 363 0.1409147
## 397 0.1510194
## 401 0.1517372
## 637 0.7438254
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 157 FALSE
## 333 FALSE
## 363 FALSE
## 397 FALSE
## 401 FALSE
## 637 FALSE
## Popular.fctr.All.X..rcv.glmnet.accurate
## 157 FALSE
## 333 FALSE
## 363 FALSE
## 397 FALSE
## 401 FALSE
## 637 FALSE
## Popular.fctr.All.X..rcv.glmnet.error
## 157 0.01212606
## 333 0.03575772
## 363 0.04091468
## 397 0.05101942
## 401 0.05173716
## 637 0.64382538
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 643 770 N 0.7843817
## 644 221 N 0.7910725
## 645 472 N 0.7916843
## 646 1448 N 0.8013989
## 647 3590 N 0.8014895
## 648 2995 N 0.8065219
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 643 Y TRUE
## 644 Y TRUE
## 645 Y TRUE
## 646 Y TRUE
## 647 Y TRUE
## 648 Y TRUE
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 643 0.7843817
## 644 0.7910725
## 645 0.7916843
## 646 0.8013989
## 647 0.8014895
## 648 0.8065219
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 643 FALSE
## 644 FALSE
## 645 FALSE
## 646 FALSE
## 647 FALSE
## 648 FALSE
## Popular.fctr.All.X..rcv.glmnet.accurate
## 643 FALSE
## 644 FALSE
## 645 FALSE
## 646 FALSE
## 647 FALSE
## 648 FALSE
## Popular.fctr.All.X..rcv.glmnet.error
## 643 0.6843817
## 644 0.6910725
## 645 0.6916843
## 646 0.7013989
## 647 0.7014895
## 648 0.7065219
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
# Attach the selected model's per-category error stats (Fit and OOB) to
# glbLvlCategory, print the table ordered by mean absolute error, then print
# the column totals.
if (!is.null(glbFeatsCategory)) {
    glbLvlCategory <- merge(glbLvlCategory,
        myget_category_stats(obs_df = glbObsFit, mdl_id = glb_sel_mdl_id,
                             label = "fit"),
        by = glbFeatsCategory, all = TRUE)
    row.names(glbLvlCategory) <- glbLvlCategory[, glbFeatsCategory]

    # No `by`: merge() joins on every shared column, which includes .n.OOB
    # when glbLvlCategory already carries it
    glbLvlCategory <- merge(glbLvlCategory,
        myget_category_stats(obs_df = glbObsOOB, mdl_id = glb_sel_mdl_id,
                             label="OOB"),
        #by=glbFeatsCategory, all=TRUE) glb_ctgry-df already contains .n.OOB ?
        all = TRUE)
    row.names(glbLvlCategory) <- glbLvlCategory[, glbFeatsCategory]

    # Worst categories first, by OOB error when OOB metrics drive evaluation
    if (any(grepl("OOB", glbMdlMetricsEval))) {
        print(orderBy(~-err.abs.OOB.mean, glbLvlCategory))
    } else {
        print(orderBy(~-err.abs.fit.mean, glbLvlCategory))
    }

    # Column totals over everything except the category column itself.
    # Exact name matching (not -grep(), which regex-matches and drops every
    # column when nothing matches); na.rm so that a category missing from one
    # data set does not turn the whole total into NA.
    print(colSums(
        glbLvlCategory[, !(names(glbLvlCategory) %in% glbFeatsCategory),
                       drop = FALSE],
        na.rm = TRUE))
}
## NDSSName.my.fctr
## OpEd#Opinion# OpEd#Opinion#
## #Opinion#ThePublicEditor #Opinion#ThePublicEditor
## Styles#U.S.# Styles#U.S.#
## Business#Crosswords/Games# Business#Crosswords/Games#
## Science#Health# Science#Health#
## Business#Technology# Business#Technology#
## ## ##
## Business#BusinessDay#Dealbook Business#BusinessDay#Dealbook
## Metro#N.Y./Region# Metro#N.Y./Region#
## Culture#Arts# Culture#Arts#
## #Opinion#RoomForDebate #Opinion#RoomForDebate
## Styles##Fashion Styles##Fashion
## Business#BusinessDay#SmallBusiness Business#BusinessDay#SmallBusiness
## myOther myOther
## Travel#Travel# Travel#Travel#
## Culture## Culture##
## Foreign#World#AsiaPacific Foreign#World#AsiaPacific
## #Multimedia# #Multimedia#
## TStyle## TStyle##
## #U.S.#Education #U.S.#Education
## Foreign#World# Foreign#World#
## .n.OOB .n.Fit .n.Tst .freqRatio.Fit
## OpEd#Opinion# 89 437 164 0.090965862
## #Opinion#ThePublicEditor 4 16 10 0.003330558
## Styles#U.S.# 50 127 61 0.026436303
## Business#Crosswords/Games# 18 105 42 0.021856786
## Science#Health# 48 148 57 0.030807660
## Business#Technology# 126 213 114 0.044338052
## ## 371 913 342 0.190049958
## Business#BusinessDay#Dealbook 323 629 304 0.130932556
## Metro#N.Y./Region# 70 128 67 0.026644463
## Culture#Arts# 185 490 174 0.101998335
## #Opinion#RoomForDebate 20 42 20 0.008742714
## Styles##Fashion 15 104 15 0.021648626
## Business#BusinessDay#SmallBusiness 40 100 41 0.020815987
## myOther 5 33 5 0.006869276
## Travel#Travel# 34 83 35 0.017277269
## Culture## 1 NA 70 NA
## Foreign#World#AsiaPacific 53 150 56 0.031223980
## #Multimedia# 49 92 52 0.019150708
## TStyle## 101 623 105 0.129683597
## #U.S.#Education 82 243 89 0.050582848
## Foreign#World# 44 128 47 0.026644463
## .freqRatio.OOB .freqRatio.Tst
## OpEd#Opinion# 0.0515046296 0.087700535
## #Opinion#ThePublicEditor 0.0023148148 0.005347594
## Styles#U.S.# 0.0289351852 0.032620321
## Business#Crosswords/Games# 0.0104166667 0.022459893
## Science#Health# 0.0277777778 0.030481283
## Business#Technology# 0.0729166667 0.060962567
## ## 0.2146990741 0.182887701
## Business#BusinessDay#Dealbook 0.1869212963 0.162566845
## Metro#N.Y./Region# 0.0405092593 0.035828877
## Culture#Arts# 0.1070601852 0.093048128
## #Opinion#RoomForDebate 0.0115740741 0.010695187
## Styles##Fashion 0.0086805556 0.008021390
## Business#BusinessDay#SmallBusiness 0.0231481481 0.021925134
## myOther 0.0028935185 0.002673797
## Travel#Travel# 0.0196759259 0.018716578
## Culture## 0.0005787037 0.037433155
## Foreign#World#AsiaPacific 0.0306712963 0.029946524
## #Multimedia# 0.0283564815 0.027807487
## TStyle## 0.0584490741 0.056149733
## #U.S.#Education 0.0474537037 0.047593583
## Foreign#World# 0.0254629630 0.025133690
## err.abs.fit.sum err.abs.fit.mean .n.fit
## OpEd#Opinion# 169.951607 0.38890528 437
## #Opinion#ThePublicEditor 7.268793 0.45429959 16
## Styles#U.S.# 62.316078 0.49067778 127
## Business#Crosswords/Games# 37.669866 0.35876063 105
## Science#Health# 67.279323 0.45459002 148
## Business#Technology# 46.853632 0.21997010 213
## ## 132.849946 0.14550925 913
## Business#BusinessDay#Dealbook 96.801881 0.15389806 629
## Metro#N.Y./Region# 19.919519 0.15562125 128
## Culture#Arts# 60.255887 0.12297120 490
## #Opinion#RoomForDebate 6.538212 0.15567171 42
## Styles##Fashion 8.955804 0.08611350 104
## Business#BusinessDay#SmallBusiness 13.040079 0.13040079 100
## myOther 3.720227 0.11273415 33
## Travel#Travel# 6.799810 0.08192542 83
## Culture## NA NA NA
## Foreign#World#AsiaPacific 15.551484 0.10367656 150
## #Multimedia# 8.346537 0.09072323 92
## TStyle## 43.577253 0.06994744 623
## #U.S.#Education 15.475235 0.06368409 243
## Foreign#World# 8.939209 0.06983757 128
## err.abs.OOB.sum err.abs.OOB.mean
## OpEd#Opinion# 46.5657806 0.52321102
## #Opinion#ThePublicEditor 2.0095642 0.50239104
## Styles#U.S.# 23.6233012 0.47246602
## Business#Crosswords/Games# 8.3810907 0.46561615
## Science#Health# 22.1467580 0.46139079
## Business#Technology# 29.7301209 0.23595334
## ## 78.0396865 0.21034956
## Business#BusinessDay#Dealbook 66.1858222 0.20490967
## Metro#N.Y./Region# 13.4646433 0.19235205
## Culture#Arts# 34.6762293 0.18743908
## #Opinion#RoomForDebate 3.7374728 0.18687364
## Styles##Fashion 2.1602732 0.14401821
## Business#BusinessDay#SmallBusiness 5.6285877 0.14071469
## myOther 0.5676045 0.11352091
## Travel#Travel# 3.6267258 0.10666841
## Culture## 0.1027346 0.10273457
## Foreign#World#AsiaPacific 5.3397418 0.10074984
## #Multimedia# 4.8380137 0.09873497
## TStyle## 9.5104193 0.09416257
## #U.S.#Education 6.0190802 0.07340342
## Foreign#World# 3.1966422 0.07265096
## .n.OOB .n.Fit .n.Tst .freqRatio.Fit
## 1728.000000 NA 1870.000000 NA
## .freqRatio.OOB .freqRatio.Tst err.abs.fit.sum err.abs.fit.mean
## 1.000000 1.000000 NA NA
## .n.fit err.abs.OOB.sum err.abs.OOB.mean
## NA 369.550293 4.690311
# Export the OOB observations: the id column(s) plus every column whose name
# contains glb_rsp_var (fixed-string match). The file name is
# glb_out_pfx + glb_sel_mdl_id with every "." replaced by "_", followed by
# "_OOBobs.csv".
write.csv(glbObsOOB[, c(glb_id_var,
grep(glb_rsp_var, names(glbObsOOB), fixed=TRUE, value=TRUE))],
paste0(gsub(".", "_", paste0(glb_out_pfx, glb_sel_mdl_id), fixed=TRUE),
"_OOBobs.csv"), row.names=FALSE)
# Start a fresh chunk log (first argument NULL) with a "teardown" entry.
fit.models_2_chunk_df <-
myadd_chunk(NULL, "fit.models_2_bgn", label.minor = "teardown")
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_2_bgn 1 0 teardown 374.431 NA NA
# Log the next "fit.models" step in the global chunk log without advancing
# the major step (major.inc=FALSE).
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
## label step_major step_minor label_minor bgn end elapsed
## 12 fit.models 6 2 2 359.525 374.443 14.918
## 13 fit.models 6 3 3 374.443 NA NA
# if (sum(is.na(glbObsAll$D.P.http)) > 0)
# stop("fit.models_3: Why is this happening ?")
#stop(here"); glb2Sav()
# Copy the columns that exist only in glbObsFit / glbObsOOB back into the
# matching rows (.lcn == "Fit" / "OOB") of the globals glbObsTrn and glbObsAll.
#
# The columns to copy are determined BEFORE anything is modified. Previously
# the OOB column lists were computed after the Fit loops had already added
# the shared columns to the targets, so OOB values for those columns were
# never copied and stayed NA.
#
# OOB rows of glbObsTrn are only filled when glbObsNew has no response values
# (unchanged condition). An OOB copy is skipped with a warning when the number
# of target "OOB" rows differs from nrow(glbObsOOB).
#
# Side effects: modifies glbObsTrn and glbObsAll via `<<-`.
# Returns NULL invisibly.
sync_glb_obs_df <- function() {
    # Merge or cbind ?
    fit_cols_trn <- setdiff(names(glbObsFit), names(glbObsTrn))
    fit_cols_all <- setdiff(names(glbObsFit), names(glbObsAll))
    oob_cols_trn <- setdiff(names(glbObsOOB), names(glbObsTrn))
    oob_cols_all <- setdiff(names(glbObsOOB), names(glbObsAll))

    # TRUE when the flagged target rows line up one-to-one with the source rows
    rows_match <- function(row_flags, src_df)
        isTRUE(sum(row_flags) == nrow(src_df))

    for (col in fit_cols_trn)
        glbObsTrn[glbObsTrn$.lcn == "Fit", col] <<- glbObsFit[, col]
    for (col in fit_cols_all)
        glbObsAll[glbObsAll$.lcn == "Fit", col] <<- glbObsFit[, col]

    if (all(is.na(glbObsNew[, glb_rsp_var])) && (length(oob_cols_trn) > 0)) {
        trn_oob_rows <- glbObsTrn$.lcn == "OOB"
        if (rows_match(trn_oob_rows, glbObsOOB)) {
            for (col in oob_cols_trn)
                glbObsTrn[trn_oob_rows, col] <<- glbObsOOB[, col]
        } else {
            warning("sync_glb_obs_df: OOB rows in glbObsTrn do not match ",
                    "glbObsOOB; OOB columns not synced")
        }
    }

    if (length(oob_cols_all) > 0) {
        all_oob_rows <- glbObsAll$.lcn == "OOB"
        if (rows_match(all_oob_rows, glbObsOOB)) {
            for (col in oob_cols_all)
                glbObsAll[all_oob_rows, col] <<- glbObsOOB[, col]
        } else {
            warning("sync_glb_obs_df: OOB rows in glbObsAll do not match ",
                    "glbObsOOB; OOB columns not synced")
        }
    }

    invisible(NULL)
}
# Push columns held only by glbObsFit / glbObsOOB into glbObsTrn / glbObsAll.
sync_glb_obs_df()
# List any columns of glbObsNew that glbObsAll does not have.
print(setdiff(names(glbObsNew), names(glbObsAll)))
## character(0)
# Optionally checkpoint the model-selection state to
# "<glb_out_pfx>selmdl_dsk.RData".
if (glb_save_envir)
    save(glb_feats_df,
         glbObsAll, #glbObsTrn, glbObsFit, glbObsOOB, glbObsNew,
         glb_models_df, dsp_models_df, glb_models_lst, glb_sel_mdl, glb_sel_mdl_id,
         glb_model_type,
         file=paste0(glb_out_pfx, "selmdl_dsk.RData"))
#load(paste0(glb_out_pfx, "selmdl_dsk.RData"))

# ret_lst only exists when a model was fitted earlier in this environment
# (e.g. the ensemble chunk ran); an unconditional rm() warns
# "object 'ret_lst' not found" otherwise.
if (exists("ret_lst", inherits = FALSE))
    rm(ret_lst)
## Warning in rm(ret_lst): object 'ret_lst' not found
# Append "model.selected" to glb_analytics_avl_objs (the assignment happens
# inside the call) and replay the analytics petri net with the updated list.
replay.petrisim(pn=glb_analytics_pn,
replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"model.selected")), flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
# Log the start of the "fit.data.training" chunk as a new major step.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc=TRUE)
## label step_major step_minor label_minor bgn end
## 13 fit.models 6 3 3 374.443 380.488
## 14 fit.data.training 7 0 0 380.488 NA
## elapsed
## 13 6.045
## 14 NA
# Step 7.0: fit data training
#load(paste0(glb_inp_pfx, "dsk.RData"))
#stop(here"); glb2Sav()
# Select or fit the final model, leaving it in glb_fin_mdl / glb_fin_mdl_id.
# Three cases are handled in order:
#   1. glb_fin_mdl_id is already set and names an entry of glb_models_lst:
#      that entry is reused as-is.
#   2. glbObsNew has at least one non-NA value in glb_rsp_var: glb_sel_mdl is
#      reused and stored in glb_models_lst under "Final.<glb_sel_mdl_id>".
#   3. Otherwise: myrun_rfe is rerun on glbObsTrn and the selected algorithm
#      is refit on glbObsTrn (less any ids listed in glbObsTrnOutliers).
if (!is.null(glb_fin_mdl_id) && (glb_fin_mdl_id %in% names(glb_models_lst))) {
warning("Final model same as user selected model")
glb_fin_mdl <- glb_models_lst[[glb_fin_mdl_id]]
} else
# if (nrow(glbObsFit) + length(glbObsFitOutliers) == nrow(glbObsTrn))
if (!all(is.na(glbObsNew[, glb_rsp_var])))
{
warning("Final model same as glb_sel_mdl_id")
glb_fin_mdl_id <- paste0("Final.", glb_sel_mdl_id)
glb_fin_mdl <- glb_sel_mdl
# NOTE(review): only glb_models_lst is extended in this branch; no matching
# row is added to glb_models_df here
glb_models_lst[[glb_fin_mdl_id]] <- glb_fin_mdl
} else {
# if (grepl("RFE", glb_sel_mdl_id) ||
# (!is.null(glb_mdl_ensemble) && grepl("RFE", glb_mdl_ensemble))) {
# Candidate features: ids of glb_feats_df rows that are not flagged nzv and
# whose exclude.as.feat is not 1, passed through myadjust_interaction_feats
indep_vars <- myadjust_interaction_feats(subset(glb_feats_df,
!nzv & (exclude.as.feat != 1))[, "id"])
# Run myrun_rfe on glbObsTrn with the sizes configured under
# glbRFESizes[["Final"]]
rfe_trn_results <-
myrun_rfe(glbObsTrn, indep_vars, glbRFESizes[["Final"]])
# If the predictor sets differ, print the differences in both directions.
# rfe_fit_results is not assigned in this chunk -- presumably the earlier RFE
# run on the Fit partition; TODO confirm
if (!isTRUE(all.equal(sort(predictors(rfe_trn_results)),
sort(predictors(rfe_fit_results))))) {
print("Diffs predictors(rfe_trn_results) vs. predictors(rfe_fit_results):")
print(setdiff(predictors(rfe_trn_results), predictors(rfe_fit_results)))
print("Diffs predictors(rfe_fit_results) vs. predictors(rfe_trn_results):")
print(setdiff(predictors(rfe_fit_results), predictors(rfe_trn_results)))
}
# }
# Ensemble case: refit each sufficiently important component model on
# glbObsTrn and add its predictions to glbObsTrn and glbObsNew
if (grepl("Ensemble", glb_sel_mdl_id)) {
# Find which models are relevant
# (components of glb_sel_mdl whose importance exceeds 5)
mdlimp_df <- subset(myget_feats_importance(glb_sel_mdl), imp > 5)
# Fit selected models on glbObsTrn
# Component model ids are recovered from the importance row names by removing
# the mygetPredictIds(glb_rsp_var)$value text and ".prob" (both as fixed
# strings, wherever they occur in the name)
for (mdl_id in gsub(".prob", "",
gsub(mygetPredictIds(glb_rsp_var)$value, "", row.names(mdlimp_df), fixed = TRUE),
fixed = TRUE)) {
# The last dot-separated component of the id is used as the train method
# below; it is replaced by "Train" to form the new id prefix
mdl_id_components <- unlist(strsplit(mdl_id, "[.]"))
mdlIdPfx <- paste0(c(head(mdl_id_components, -1), "Train"),
collapse = ".")
# RFE.X components take their features from the new RFE run; all others reuse
# the comma-separated "feats" string recorded for the component in
# glb_models_df
if (grepl("RFE\\.X\\.", mdlIdPfx))
mdlIndepVars <- myadjust_interaction_feats(myextract_actual_feats(
predictors(rfe_trn_results))) else
mdlIndepVars <- trim(unlist(
strsplit(glb_models_df[glb_models_df$id == mdl_id, "feats"], "[,]")))
ret_lst <-
myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = mdlIdPfx,
type = glb_model_type, tune.df = glb_tune_models_df,
trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds,
trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = tail(mdl_id_components, 1))),
indep_vars = mdlIndepVars,
rsp_var = glb_rsp_var,
fit_df = glbObsTrn, OOB_df = NULL)
# Predict with the model whose id is now last in glb_models_df (presumably
# the one myfit_mdl just added -- TODO confirm); the default probability
# threshold is the opt.prob.threshold.OOB of the original component model
glbObsTrn <- glb_get_predictions(df = glbObsTrn,
mdl_id = tail(glb_models_df$id, 1),
rsp_var = glb_rsp_var,
prob_threshold_def =
subset(glb_models_df, id == mdl_id)$opt.prob.threshold.OOB)
glbObsNew <- glb_get_predictions(df = glbObsNew,
mdl_id = tail(glb_models_df$id, 1),
rsp_var = glb_rsp_var,
prob_threshold_def =
subset(glb_models_df, id == mdl_id)$opt.prob.threshold.OOB)
}
}
# "Final" model
# NOTE(review): model_method is assigned here but is not referenced again
# within this block (the fit loop below takes its methods from method_vctr)
if ((model_method <- glb_sel_mdl$method) == "custom")
# get actual method from the mdl_id
model_method <- tail(unlist(strsplit(glb_sel_mdl_id, "[.]")), 1)
# Features for the final fit:
#   - Ensemble: the important component names with ".Train" inserted before
#     the last dot-separated component (before "<component>.prob" for
#     binomial classification)
#   - RFE.X: the actual features behind the predictors of rfe_trn_results
#   - otherwise: the "feats" string recorded for glb_sel_mdl_id, split on commas
if (grepl("Ensemble", glb_sel_mdl_id)) {
# Find which models are relevant
mdlimp_df <- subset(myget_feats_importance(glb_sel_mdl), imp > 5)
if (glb_is_classification && glb_is_binomial)
indep_vars_vctr <- gsub("(.*)\\.(.*)\\.prob", "\\1\\.Train\\.\\2\\.prob",
row.names(mdlimp_df)) else
indep_vars_vctr <- gsub("(.*)\\.(.*)", "\\1\\.Train\\.\\2",
row.names(mdlimp_df))
} else
if (grepl("RFE.X", glb_sel_mdl_id, fixed = TRUE)) {
indep_vars_vctr <- myextract_actual_feats(predictors(rfe_trn_results))
} else indep_vars_vctr <-
trim(unlist(strsplit(glb_models_df[glb_models_df$id ==
glb_sel_mdl_id
, "feats"], "[,]")))
# Pre-processing: if one of glb_preproc_methods occurs in glb_sel_mdl_id
# (regex alternation with literal dots escaped), reuse the matched text as
# the preProcess setting; otherwise none
if (!is.null(glb_preproc_methods) &&
((match_pos <- regexpr(gsub(".", "\\.",
paste(glb_preproc_methods, collapse = "|"),
fixed = TRUE), glb_sel_mdl_id)) != -1))
ths_preProcess <- str_sub(glb_sel_mdl_id, match_pos,
match_pos + attr(match_pos, "match.length") - 1) else
ths_preProcess <- NULL
mdl_id_pfx <- ifelse(grepl("Ensemble", glb_sel_mdl_id),
"Final.Ensemble", "Final")
# Training rows: all of glbObsTrn unless glbObsTrnOutliers has an entry for
# this prefix, in which case rows whose glb_id_var value is listed there are
# dropped
trnobs_df <- if (is.null(glbObsTrnOutliers[[mdl_id_pfx]])) glbObsTrn else
glbObsTrn[!(glbObsTrn[, glb_id_var] %in%
glbObsTrnOutliers[[mdl_id_pfx]]), ]
# Force fitting of Final.glm to identify outliers
#method_vctr <- unique(c("glm", myparseMdlId(glb_sel_mdl_id)$alg))
# or skip glm for speed
method_vctr <- myparseMdlId(glb_sel_mdl_id)$alg
for (method in method_vctr) {
#source("caret_nominalTrainWorkflow.R")
# glmnet requires at least 2 indep vars
if ((length(indep_vars_vctr) == 1) && (method %in% "glmnet"))
next
# Parallel resampling is switched off for glm and glmnet, on for every other
# method
ret_lst <-
myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = mdl_id_pfx,
type = glb_model_type, trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds,
trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.allowParallel = if (method %in% c("glm", "glmnet")) FALSE else TRUE,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = method,
train.preProcess = ths_preProcess)),
indep_vars = indep_vars_vctr, rsp_var = glb_rsp_var,
fit_df = trnobs_df, OOB_df = NULL)
}
# Take the last entry of glb_models_lst as the final model, unless several
# methods were looped over and the last one was "glm".
# NOTE(review): this assumes myfit_mdl appended the new model to both
# glb_models_lst and glb_models_df at the same position -- confirm. If the
# only method was skipped by `next` above, the last entry is an earlier model.
if ((length(method_vctr) == 1) || (method != "glm")) {
glb_fin_mdl <- glb_models_lst[[length(glb_models_lst)]]
glb_fin_mdl_id <- glb_models_df[length(glb_models_lst), "id"]
}
}
## +(rfe) fit Fold1.Rep1 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold1.Rep1 size: 60
## +(rfe) imp Fold1.Rep1
## -(rfe) imp Fold1.Rep1
## +(rfe) fit Fold1.Rep1 size: 32
## -(rfe) fit Fold1.Rep1 size: 32
## +(rfe) fit Fold1.Rep1 size: 16
## -(rfe) fit Fold1.Rep1 size: 16
## +(rfe) fit Fold1.Rep1 size: 8
## -(rfe) fit Fold1.Rep1 size: 8
## +(rfe) fit Fold1.Rep1 size: 4
## -(rfe) fit Fold1.Rep1 size: 4
## +(rfe) fit Fold1.Rep1 size: 2
## -(rfe) fit Fold1.Rep1 size: 2
## +(rfe) fit Fold2.Rep1 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold2.Rep1 size: 60
## +(rfe) imp Fold2.Rep1
## -(rfe) imp Fold2.Rep1
## +(rfe) fit Fold2.Rep1 size: 32
## -(rfe) fit Fold2.Rep1 size: 32
## +(rfe) fit Fold2.Rep1 size: 16
## -(rfe) fit Fold2.Rep1 size: 16
## +(rfe) fit Fold2.Rep1 size: 8
## -(rfe) fit Fold2.Rep1 size: 8
## +(rfe) fit Fold2.Rep1 size: 4
## -(rfe) fit Fold2.Rep1 size: 4
## +(rfe) fit Fold2.Rep1 size: 2
## -(rfe) fit Fold2.Rep1 size: 2
## +(rfe) fit Fold3.Rep1 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold3.Rep1 size: 60
## +(rfe) imp Fold3.Rep1
## -(rfe) imp Fold3.Rep1
## +(rfe) fit Fold3.Rep1 size: 32
## -(rfe) fit Fold3.Rep1 size: 32
## +(rfe) fit Fold3.Rep1 size: 16
## -(rfe) fit Fold3.Rep1 size: 16
## +(rfe) fit Fold3.Rep1 size: 8
## -(rfe) fit Fold3.Rep1 size: 8
## +(rfe) fit Fold3.Rep1 size: 4
## -(rfe) fit Fold3.Rep1 size: 4
## +(rfe) fit Fold3.Rep1 size: 2
## -(rfe) fit Fold3.Rep1 size: 2
## +(rfe) fit Fold1.Rep2 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold1.Rep2 size: 60
## +(rfe) imp Fold1.Rep2
## -(rfe) imp Fold1.Rep2
## +(rfe) fit Fold1.Rep2 size: 32
## -(rfe) fit Fold1.Rep2 size: 32
## +(rfe) fit Fold1.Rep2 size: 16
## -(rfe) fit Fold1.Rep2 size: 16
## +(rfe) fit Fold1.Rep2 size: 8
## -(rfe) fit Fold1.Rep2 size: 8
## +(rfe) fit Fold1.Rep2 size: 4
## -(rfe) fit Fold1.Rep2 size: 4
## +(rfe) fit Fold1.Rep2 size: 2
## -(rfe) fit Fold1.Rep2 size: 2
## +(rfe) fit Fold2.Rep2 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold2.Rep2 size: 60
## +(rfe) imp Fold2.Rep2
## -(rfe) imp Fold2.Rep2
## +(rfe) fit Fold2.Rep2 size: 32
## -(rfe) fit Fold2.Rep2 size: 32
## +(rfe) fit Fold2.Rep2 size: 16
## -(rfe) fit Fold2.Rep2 size: 16
## +(rfe) fit Fold2.Rep2 size: 8
## -(rfe) fit Fold2.Rep2 size: 8
## +(rfe) fit Fold2.Rep2 size: 4
## -(rfe) fit Fold2.Rep2 size: 4
## +(rfe) fit Fold2.Rep2 size: 2
## -(rfe) fit Fold2.Rep2 size: 2
## +(rfe) fit Fold3.Rep2 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold3.Rep2 size: 60
## +(rfe) imp Fold3.Rep2
## -(rfe) imp Fold3.Rep2
## +(rfe) fit Fold3.Rep2 size: 32
## -(rfe) fit Fold3.Rep2 size: 32
## +(rfe) fit Fold3.Rep2 size: 16
## -(rfe) fit Fold3.Rep2 size: 16
## +(rfe) fit Fold3.Rep2 size: 8
## -(rfe) fit Fold3.Rep2 size: 8
## +(rfe) fit Fold3.Rep2 size: 4
## -(rfe) fit Fold3.Rep2 size: 4
## +(rfe) fit Fold3.Rep2 size: 2
## -(rfe) fit Fold3.Rep2 size: 2
## +(rfe) fit Fold1.Rep3 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold1.Rep3 size: 60
## +(rfe) imp Fold1.Rep3
## -(rfe) imp Fold1.Rep3
## +(rfe) fit Fold1.Rep3 size: 32
## -(rfe) fit Fold1.Rep3 size: 32
## +(rfe) fit Fold1.Rep3 size: 16
## -(rfe) fit Fold1.Rep3 size: 16
## +(rfe) fit Fold1.Rep3 size: 8
## -(rfe) fit Fold1.Rep3 size: 8
## +(rfe) fit Fold1.Rep3 size: 4
## -(rfe) fit Fold1.Rep3 size: 4
## +(rfe) fit Fold1.Rep3 size: 2
## -(rfe) fit Fold1.Rep3 size: 2
## +(rfe) fit Fold2.Rep3 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold2.Rep3 size: 60
## +(rfe) imp Fold2.Rep3
## -(rfe) imp Fold2.Rep3
## +(rfe) fit Fold2.Rep3 size: 32
## -(rfe) fit Fold2.Rep3 size: 32
## +(rfe) fit Fold2.Rep3 size: 16
## -(rfe) fit Fold2.Rep3 size: 16
## +(rfe) fit Fold2.Rep3 size: 8
## -(rfe) fit Fold2.Rep3 size: 8
## +(rfe) fit Fold2.Rep3 size: 4
## -(rfe) fit Fold2.Rep3 size: 4
## +(rfe) fit Fold2.Rep3 size: 2
## -(rfe) fit Fold2.Rep3 size: 2
## +(rfe) fit Fold3.Rep3 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold3.Rep3 size: 60
## +(rfe) imp Fold3.Rep3
## -(rfe) imp Fold3.Rep3
## +(rfe) fit Fold3.Rep3 size: 32
## -(rfe) fit Fold3.Rep3 size: 32
## +(rfe) fit Fold3.Rep3 size: 16
## -(rfe) fit Fold3.Rep3 size: 16
## +(rfe) fit Fold3.Rep3 size: 8
## -(rfe) fit Fold3.Rep3 size: 8
## +(rfe) fit Fold3.Rep3 size: 4
## -(rfe) fit Fold3.Rep3 size: 4
## +(rfe) fit Fold3.Rep3 size: 2
## -(rfe) fit Fold3.Rep3 size: 2
## Warning in lda.default(x, grouping, ...): variables are collinear
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (3 fold, repeated 3 times)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 2 0.8204 0.03718 0.003445 0.01035
## 4 0.8760 0.44502 0.003122 0.01919
## 8 0.8738 0.44420 0.003283 0.02061
## 16 0.9016 0.63787 0.006552 0.02352
## 32 0.9013 0.63732 0.006547 0.02350
## 60 0.9029 0.64607 0.006800 0.02399 *
##
## The top 5 variables (out of 60):
## WordCount.log1p, WordCount.root2, WordCount.nexp, NDSSName.my.fctrOpEd#Opinion#, PubDate.day.minutes.poly.1
##
## [1] "WordCount.log1p"
## [2] "WordCount.root2"
## [3] "WordCount.nexp"
## [4] "NDSSName.my.fctrOpEd#Opinion#"
## [5] "PubDate.day.minutes.poly.1"
## [6] "PubDate.day.minutes.poly.4"
## [7] "PubDate.hour.fctr(15.3,23]"
## [8] "PubDate.last4.log1p"
## [9] "PubDate.last2.log1p"
## [10] "NDSSName.my.fctrScience#Health#"
## [11] "NDSSName.my.fctrBusiness#Crosswords/Games#"
## [12] "PubDate.day.minutes.poly.5"
## [13] "PubDate.last8.log1p"
## [14] "NDSSName.my.fctrStyles#U.S.#"
## [15] "PubDate.wkend"
## [16] "PubDate.last16.log1p"
## [17] "NDSSName.my.fctr##:PubDate.day.minutes.poly.2.ctg"
## [18] "PubDate.day.minutes.poly.2"
## [19] "PubDate.juliandate"
## [20] "PubDate.wkday.fctr6"
## [21] "PubDate.month.fctr11"
## [22] "PubDate.second.fctr(14.8,29.5]"
## [23] "PubDate.date.fctr(7,13]"
## [24] ".rnorm"
## [25] "NDSSName.my.fctr##:PubDate.day.minutes.poly.5.ctg"
## [26] "PubDate.wkday.fctr1"
## [27] "PubDate.day.minutes.poly.3"
## [28] "NDSSName.my.fctr##:PubDate.day.minutes.poly.3.ctg"
## [29] "PubDate.date.fctr(25,31]"
## [30] "PubDate.last32.log1p"
## [31] "PubDate.hour.fctr(7.67,15.3]"
## [32] "NDSSName.my.fctr##:PubDate.day.minutes.poly.1.ctg"
## [33] "PubDate.minute.fctr(14.8,29.5]"
## [34] "NDSSName.my.fctr##:PubDate.day.minutes.poly.4.ctg"
## [35] "PubDate.month.fctr10"
## [36] "NDSSName.my.fctrBusiness#Technology#"
## [37] "NDSSName.my.fctrmyOther"
## [38] "PubDate.wkday.fctr3"
## [39] "PubDate.date.fctr(13,19]"
## [40] "PubDate.second.fctr(29.5,44.2]"
## [41] "PubDate.minute.fctr(44.2,59.1]"
## [42] "PubDate.wkday.fctr4"
## [43] "PubDate.second.fctr(44.2,59.1]"
## [44] "NDSSName.my.fctr#Opinion#RoomForDebate"
## [45] "PubDate.date.fctr(19,25]"
## [46] "NDSSName.my.fctrMetro#N.Y./Region#"
## [47] "NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness"
## [48] "NDSSName.my.fctrTravel#Travel#"
## [49] "NDSSName.my.fctrStyles##Fashion"
## [50] "NDSSName.my.fctr#Multimedia#"
## [51] "PubDate.wkday.fctr2"
## [52] "NDSSName.my.fctrForeign#World#"
## [53] "NDSSName.my.fctrForeign#World#AsiaPacific"
## [54] "PubDate.wkday.fctr5"
## [55] "PubDate.minute.fctr(29.5,44.2]"
## [56] "NDSSName.my.fctr#U.S.#Education"
## [57] "NDSSName.my.fctrCulture#Arts#"
## [58] "NDSSName.my.fctrBusiness#BusinessDay#Dealbook"
## [59] "NDSSName.my.fctr##"
## [60] "NDSSName.my.fctrTStyle##"
## [1] "fitting model: Final##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,WordCount.log1p,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.hour.fctr,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last2.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,PubDate.juliandate,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5"
## + Fold1.Rep1: alpha=0.100, lambda=0.07781
## - Fold1.Rep1: alpha=0.100, lambda=0.07781
## + Fold1.Rep1: alpha=0.325, lambda=0.07781
## - Fold1.Rep1: alpha=0.325, lambda=0.07781
## + Fold1.Rep1: alpha=0.550, lambda=0.07781
## - Fold1.Rep1: alpha=0.550, lambda=0.07781
## + Fold1.Rep1: alpha=0.775, lambda=0.07781
## - Fold1.Rep1: alpha=0.775, lambda=0.07781
## + Fold1.Rep1: alpha=1.000, lambda=0.07781
## - Fold1.Rep1: alpha=1.000, lambda=0.07781
## + Fold2.Rep1: alpha=0.100, lambda=0.07781
## - Fold2.Rep1: alpha=0.100, lambda=0.07781
## + Fold2.Rep1: alpha=0.325, lambda=0.07781
## - Fold2.Rep1: alpha=0.325, lambda=0.07781
## + Fold2.Rep1: alpha=0.550, lambda=0.07781
## - Fold2.Rep1: alpha=0.550, lambda=0.07781
## + Fold2.Rep1: alpha=0.775, lambda=0.07781
## - Fold2.Rep1: alpha=0.775, lambda=0.07781
## + Fold2.Rep1: alpha=1.000, lambda=0.07781
## - Fold2.Rep1: alpha=1.000, lambda=0.07781
## + Fold3.Rep1: alpha=0.100, lambda=0.07781
## - Fold3.Rep1: alpha=0.100, lambda=0.07781
## + Fold3.Rep1: alpha=0.325, lambda=0.07781
## - Fold3.Rep1: alpha=0.325, lambda=0.07781
## + Fold3.Rep1: alpha=0.550, lambda=0.07781
## - Fold3.Rep1: alpha=0.550, lambda=0.07781
## + Fold3.Rep1: alpha=0.775, lambda=0.07781
## - Fold3.Rep1: alpha=0.775, lambda=0.07781
## + Fold3.Rep1: alpha=1.000, lambda=0.07781
## - Fold3.Rep1: alpha=1.000, lambda=0.07781
## + Fold1.Rep2: alpha=0.100, lambda=0.07781
## - Fold1.Rep2: alpha=0.100, lambda=0.07781
## + Fold1.Rep2: alpha=0.325, lambda=0.07781
## - Fold1.Rep2: alpha=0.325, lambda=0.07781
## + Fold1.Rep2: alpha=0.550, lambda=0.07781
## - Fold1.Rep2: alpha=0.550, lambda=0.07781
## + Fold1.Rep2: alpha=0.775, lambda=0.07781
## - Fold1.Rep2: alpha=0.775, lambda=0.07781
## + Fold1.Rep2: alpha=1.000, lambda=0.07781
## - Fold1.Rep2: alpha=1.000, lambda=0.07781
## + Fold2.Rep2: alpha=0.100, lambda=0.07781
## - Fold2.Rep2: alpha=0.100, lambda=0.07781
## + Fold2.Rep2: alpha=0.325, lambda=0.07781
## - Fold2.Rep2: alpha=0.325, lambda=0.07781
## + Fold2.Rep2: alpha=0.550, lambda=0.07781
## - Fold2.Rep2: alpha=0.550, lambda=0.07781
## + Fold2.Rep2: alpha=0.775, lambda=0.07781
## - Fold2.Rep2: alpha=0.775, lambda=0.07781
## + Fold2.Rep2: alpha=1.000, lambda=0.07781
## - Fold2.Rep2: alpha=1.000, lambda=0.07781
## + Fold3.Rep2: alpha=0.100, lambda=0.07781
## - Fold3.Rep2: alpha=0.100, lambda=0.07781
## + Fold3.Rep2: alpha=0.325, lambda=0.07781
## - Fold3.Rep2: alpha=0.325, lambda=0.07781
## + Fold3.Rep2: alpha=0.550, lambda=0.07781
## - Fold3.Rep2: alpha=0.550, lambda=0.07781
## + Fold3.Rep2: alpha=0.775, lambda=0.07781
## - Fold3.Rep2: alpha=0.775, lambda=0.07781
## + Fold3.Rep2: alpha=1.000, lambda=0.07781
## - Fold3.Rep2: alpha=1.000, lambda=0.07781
## + Fold1.Rep3: alpha=0.100, lambda=0.07781
## - Fold1.Rep3: alpha=0.100, lambda=0.07781
## + Fold1.Rep3: alpha=0.325, lambda=0.07781
## - Fold1.Rep3: alpha=0.325, lambda=0.07781
## + Fold1.Rep3: alpha=0.550, lambda=0.07781
## - Fold1.Rep3: alpha=0.550, lambda=0.07781
## + Fold1.Rep3: alpha=0.775, lambda=0.07781
## - Fold1.Rep3: alpha=0.775, lambda=0.07781
## + Fold1.Rep3: alpha=1.000, lambda=0.07781
## - Fold1.Rep3: alpha=1.000, lambda=0.07781
## + Fold2.Rep3: alpha=0.100, lambda=0.07781
## - Fold2.Rep3: alpha=0.100, lambda=0.07781
## + Fold2.Rep3: alpha=0.325, lambda=0.07781
## - Fold2.Rep3: alpha=0.325, lambda=0.07781
## + Fold2.Rep3: alpha=0.550, lambda=0.07781
## - Fold2.Rep3: alpha=0.550, lambda=0.07781
## + Fold2.Rep3: alpha=0.775, lambda=0.07781
## - Fold2.Rep3: alpha=0.775, lambda=0.07781
## + Fold2.Rep3: alpha=1.000, lambda=0.07781
## - Fold2.Rep3: alpha=1.000, lambda=0.07781
## + Fold3.Rep3: alpha=0.100, lambda=0.07781
## - Fold3.Rep3: alpha=0.100, lambda=0.07781
## + Fold3.Rep3: alpha=0.325, lambda=0.07781
## - Fold3.Rep3: alpha=0.325, lambda=0.07781
## + Fold3.Rep3: alpha=0.550, lambda=0.07781
## - Fold3.Rep3: alpha=0.550, lambda=0.07781
## + Fold3.Rep3: alpha=0.775, lambda=0.07781
## - Fold3.Rep3: alpha=0.775, lambda=0.07781
## + Fold3.Rep3: alpha=1.000, lambda=0.07781
## - Fold3.Rep3: alpha=1.000, lambda=0.07781
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.00361 on full training set
## Length Class Mode
## a0 89 -none- numeric
## beta 5073 dgCMatrix S4
## df 89 -none- numeric
## dim 2 -none- numeric
## lambda 89 -none- numeric
## dev.ratio 89 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 57 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -6.88661237
## NDSSName.my.fctr#Multimedia#
## -0.69017131
## NDSSName.my.fctr#Opinion#RoomForDebate
## -2.75091575
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.16599950
## NDSSName.my.fctr#U.S.#Education
## -1.62723326
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.13226493
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.57337620
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.30038315
## NDSSName.my.fctrBusiness#Technology#
## 0.49759684
## NDSSName.my.fctrCulture#Arts#
## -0.02664973
## NDSSName.my.fctrForeign#World#
## -0.84857323
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.38630180
## NDSSName.my.fctrOpEd#Opinion#
## 3.78134805
## NDSSName.my.fctrScience#Health#
## 2.76279694
## NDSSName.my.fctrStyles##Fashion
## -1.04374095
## NDSSName.my.fctrStyles#U.S.#
## 2.53368117
## NDSSName.my.fctrTStyle##
## -1.17568321
## NDSSName.my.fctrTravel#Travel#
## -0.40646984
## NDSSName.my.fctrmyOther
## -0.92034972
## PubDate.date.fctr(13,19]
## -0.03969260
## PubDate.day.minutes.poly.1
## 14.45900129
## PubDate.day.minutes.poly.2
## 15.50466097
## PubDate.day.minutes.poly.3
## 2.12223169
## PubDate.day.minutes.poly.4
## 4.01900421
## PubDate.hour.fctr(7.67,15.3]
## 0.13654098
## PubDate.last16.log1p
## 0.09222896
## PubDate.last32.log1p
## 0.01367946
## PubDate.minute.fctr(29.5,44.2]
## -0.13143954
## PubDate.month.fctr11
## -0.05213495
## PubDate.second.fctr(44.2,59.1]
## -0.05593654
## PubDate.wkday.fctr1
## 0.08523292
## PubDate.wkday.fctr5
## -0.11897940
## PubDate.wkday.fctr6
## -0.07242575
## PubDate.wkend
## 0.30994932
## WordCount.log1p
## 0.35761159
## WordCount.root2
## 0.05338155
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -7.024434156
## NDSSName.my.fctr#Multimedia#
## -0.773459716
## NDSSName.my.fctr#Opinion#RoomForDebate
## -2.859705647
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.170283546
## NDSSName.my.fctr#U.S.#Education
## -1.742809921
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.166521901
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.624120498
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.290504426
## NDSSName.my.fctrBusiness#Technology#
## 0.487995233
## NDSSName.my.fctrCulture#Arts#
## -0.068894966
## NDSSName.my.fctrForeign#World#
## -0.958508086
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.466854531
## NDSSName.my.fctrOpEd#Opinion#
## 3.777480362
## NDSSName.my.fctrScience#Health#
## 2.750599151
## NDSSName.my.fctrStyles##Fashion
## -1.138065219
## NDSSName.my.fctrStyles#U.S.#
## 2.520151641
## NDSSName.my.fctrTStyle##
## -1.236297003
## NDSSName.my.fctrTravel#Travel#
## -0.481831991
## NDSSName.my.fctrmyOther
## -1.041571187
## PubDate.date.fctr(7,13]
## 0.002075209
## PubDate.date.fctr(13,19]
## -0.048106489
## PubDate.day.minutes.poly.1
## 14.872950090
## PubDate.day.minutes.poly.2
## 16.952838409
## PubDate.day.minutes.poly.3
## 2.192634396
## PubDate.day.minutes.poly.4
## 3.288655447
## PubDate.hour.fctr(7.67,15.3]
## 0.178973137
## PubDate.last16.log1p
## 0.098282543
## PubDate.last32.log1p
## 0.015961358
## PubDate.minute.fctr(29.5,44.2]
## -0.140081948
## PubDate.month.fctr11
## -0.062964474
## PubDate.second.fctr(44.2,59.1]
## -0.064448103
## PubDate.wkday.fctr1
## 0.093086616
## PubDate.wkday.fctr2
## -0.010913486
## PubDate.wkday.fctr5
## -0.128522776
## PubDate.wkday.fctr6
## -0.110017248
## PubDate.wkend
## 0.318726398
## WordCount.log1p
## 0.368532055
## WordCount.root2
## 0.053251505
## Prediction
## Reference N Y
## N 4981 458
## Y 228 865
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.949786e-01 6.523492e-01 8.872900e-01 9.023124e-01 8.326699e-01
## AccuracyPValue McnemarPValue
## 1.409434e-46 2.264774e-18
## id
## 1 Final##rcv#glmnet
## feats
## 1 WordCount.root2,WordCount.log1p,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.hour.fctr,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last2.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,PubDate.juliandate,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 36.993 0.842
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.801372 0.96139 0.6413541 0.9355675
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.7160596 0.9065624
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.88729 0.9023124 0.6401418
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.005702198 0.02639621
# Drop the temporary fit result. ret_lst is only assigned when the final-model
# step above actually called myfit_mdl; when an existing model was reused
# there is nothing to remove, so guard against rm() warning about a missing
# object.
if (exists("ret_lst", inherits = FALSE)) {
  rm(ret_lst)
}
# Start the next minor step of "fit.data.training" in the chunk log
glb_chunks_df <- myadd_chunk(
  glb_chunks_df,
  "fit.data.training",
  major.inc = FALSE
)
## label step_major step_minor label_minor bgn end
## 14 fit.data.training 7 0 0 380.488 448.724
## 15 fit.data.training 7 1 1 448.725 NA
## elapsed
## 14 68.236
## 15 NA
#stop(here"); glb2Sav()
# Default probability threshold for the predictions below: for binomial
# classification, the opt.prob.threshold.OOB recorded for the selected model
# (glb_sel_mdl_id) in glb_models_df; NULL in every other case
prob_threshold <- if (glb_is_classification && glb_is_binomial) {
  glb_models_df[glb_models_df$id == glb_sel_mdl_id, "opt.prob.threshold.OOB"]
} else {
  NULL
}
if (grepl("Ensemble", glb_fin_mdl_id)) {
  # Get predictions for each model in ensemble; Outliers that have been moved
  # to OOB might not have been predicted yet.
  # The component ids come from the comma-separated "feats" string of the
  # final model, with the trailing ".prob" (binomial classification only) and
  # the leading mygetPredictIds(glb_rsp_var)$value text stripped.
  mdlEnsembleComps <- unlist(
    str_split(subset(glb_models_df, id == glb_fin_mdl_id)$feats, ",")
  )
  if (glb_is_classification && glb_is_binomial) {
    mdlEnsembleComps <- gsub("\\.prob$", "", mdlEnsembleComps)
  }
  mdlEnsembleComps <- gsub(
    paste0(
      "^",
      gsub(".", "\\.", mygetPredictIds(glb_rsp_var)$value, fixed = TRUE)
    ),
    "",
    mdlEnsembleComps
  )

  for (mdl_id in mdlEnsembleComps) {
    glbObsTrn <- glb_get_predictions(
      df = glbObsTrn,
      mdl_id = mdl_id,
      rsp_var = glb_rsp_var,
      prob_threshold_def = prob_threshold
    )
    glbObsNew <- glb_get_predictions(
      df = glbObsNew,
      mdl_id = mdl_id,
      rsp_var = glb_rsp_var,
      prob_threshold_def = prob_threshold
    )
  }
}
# Add the final model's predictions to glbObsTrn, passing prob_threshold as
# the default probability threshold
glbObsTrn <- glb_get_predictions(
  df = glbObsTrn,
  mdl_id = glb_fin_mdl_id,
  rsp_var = glb_rsp_var,
  prob_threshold_def = prob_threshold
)
## Warning in glb_get_predictions(df = glbObsTrn, mdl_id = glb_fin_mdl_id, :
## Using default probability threshold: 0.1
# Update the feature-importance table from the final model, keep a copy of
# its imp column under "<glb_fin_mdl_id>.imp", and print the result
glb_featsimp_df <- myget_feats_importance(
  mdl = glb_fin_mdl,
  featsimp_df = glb_featsimp_df
)
glb_featsimp_df[, paste0(glb_fin_mdl_id, ".imp")] <- glb_featsimp_df$imp
print(glb_featsimp_df)
## All.X##rcv#glmnet.imp
## PubDate.day.minutes.poly.2 25.434881
## PubDate.day.minutes.poly.1 100.000000
## NDSSName.my.fctrOpEd#Opinion# 29.922243
## PubDate.day.minutes.poly.4 46.372779
## NDSSName.my.fctrBusiness#Crosswords/Games# 27.460942
## NDSSName.my.fctr#Opinion#ThePublicEditor 25.093700
## NDSSName.my.fctrScience#Health# 25.239681
## NDSSName.my.fctrStyles#U.S.# 23.692755
## PubDate.day.minutes.poly.3 6.246564
## NDSSName.my.fctrBusiness#Technology# 6.246564
## WordCount.log1p 7.679961
## PubDate.wkend 7.735688
## PubDate.hour.fctr(7.67,15.3] 6.246564
## PubDate.last16.log1p 6.246564
## PubDate.wkday.fctr1 6.246564
## WordCount.root2 6.473412
## PubDate.last32.log1p 6.246564
## PubDate.date.fctr(7,13] 6.246564
## .rnorm 6.246564
## NDSSName.my.fctrCulture## 6.246564
## NDSSName.my.fctrMetro#N.Y./Region# 6.246564
## PubDate.date.fctr(19,25] 6.246564
## PubDate.date.fctr(25,31] 6.246564
## PubDate.day.minutes.poly.5 6.246564
## PubDate.hour.fctr(15.3,23] 6.634766
## PubDate.juliandate 6.246564
## PubDate.last2.log1p 6.356495
## PubDate.last4.log1p 6.419482
## PubDate.last8.log1p 6.260793
## PubDate.minute.fctr(14.8,29.5] 6.246564
## PubDate.minute.fctr(44.2,59.1] 6.246564
## PubDate.month.fctr10 6.246564
## PubDate.month.fctr12 6.246564
## PubDate.second.fctr(14.8,29.5] 6.246564
## PubDate.second.fctr(29.5,44.2] 6.246564
## PubDate.wkday.fctr3 6.246564
## PubDate.wkday.fctr4 6.246564
## WordCount.nexp 6.246564
## PubDate.wkday.fctr2 6.246564
## PubDate.date.fctr(13,19] 6.246564
## NDSSName.my.fctrCulture#Arts# 4.534053
## PubDate.month.fctr11 6.246564
## PubDate.second.fctr(44.2,59.1] 6.246564
## PubDate.wkday.fctr6 6.246564
## PubDate.wkday.fctr5 6.246564
## PubDate.minute.fctr(29.5,44.2] 6.246564
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 4.622150
## NDSSName.my.fctrTravel#Travel# 4.954982
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 4.834346
## NDSSName.my.fctr#Multimedia# 5.761664
## NDSSName.my.fctrForeign#World# 4.726773
## NDSSName.my.fctrmyOther 6.090790
## NDSSName.my.fctrStyles##Fashion 3.618082
## NDSSName.my.fctrTStyle## 2.149822
## NDSSName.my.fctrForeign#World#AsiaPacific 3.157244
## NDSSName.my.fctr#U.S.#Education 3.549679
## NDSSName.my.fctr#Opinion#RoomForDebate 0.000000
## imp
## PubDate.day.minutes.poly.2 100.000000
## PubDate.day.minutes.poly.1 90.409031
## NDSSName.my.fctrOpEd#Opinion# 33.933845
## PubDate.day.minutes.poly.4 32.183176
## NDSSName.my.fctrBusiness#Crosswords/Games# 31.442333
## NDSSName.my.fctr#Opinion#ThePublicEditor 30.810953
## NDSSName.my.fctrScience#Health# 28.675494
## NDSSName.my.fctrStyles#U.S.# 27.494881
## PubDate.day.minutes.poly.3 25.727586
## NDSSName.my.fctrBusiness#Technology# 17.067558
## WordCount.log1p 16.433441
## PubDate.wkend 16.180212
## PubDate.hour.fctr(7.67,15.3] 15.428347
## PubDate.last16.log1p 15.052359
## PubDate.wkday.fctr1 15.023834
## WordCount.root2 14.827828
## PubDate.last32.log1p 14.634050
## PubDate.date.fctr(7,13] 14.563041
## .rnorm 14.554558
## NDSSName.my.fctrCulture## 14.554558
## NDSSName.my.fctrMetro#N.Y./Region# 14.554558
## PubDate.date.fctr(19,25] 14.554558
## PubDate.date.fctr(25,31] 14.554558
## PubDate.day.minutes.poly.5 14.554558
## PubDate.hour.fctr(15.3,23] 14.554558
## PubDate.juliandate 14.554558
## PubDate.last2.log1p 14.554558
## PubDate.last4.log1p 14.554558
## PubDate.last8.log1p 14.554558
## PubDate.minute.fctr(14.8,29.5] 14.554558
## PubDate.minute.fctr(44.2,59.1] 14.554558
## PubDate.month.fctr10 14.554558
## PubDate.month.fctr12 14.554558
## PubDate.second.fctr(14.8,29.5] 14.554558
## PubDate.second.fctr(29.5,44.2] 14.554558
## PubDate.wkday.fctr3 14.554558
## PubDate.wkday.fctr4 14.554558
## WordCount.nexp 14.554558
## PubDate.wkday.fctr2 14.509947
## PubDate.date.fctr(13,19] 14.316575
## NDSSName.my.fctrCulture#Arts# 14.245181
## PubDate.month.fctr11 14.242882
## PubDate.second.fctr(44.2,59.1] 14.232858
## PubDate.wkday.fctr6 14.029412
## PubDate.wkday.fctr5 13.905284
## PubDate.minute.fctr(29.5,44.2] 13.845057
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 13.736119
## NDSSName.my.fctrTravel#Travel# 12.161655
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 11.406198
## NDSSName.my.fctr#Multimedia# 10.674108
## NDSSName.my.fctrForeign#World# 9.752718
## NDSSName.my.fctrmyOther 9.338429
## NDSSName.my.fctrStyles##Fashion 8.815485
## NDSSName.my.fctrTStyle## 8.276532
## NDSSName.my.fctrForeign#World#AsiaPacific 7.114733
## NDSSName.my.fctr#U.S.#Education 5.735793
## NDSSName.my.fctr#Opinion#RoomForDebate 0.000000
## Final##rcv#glmnet.imp
## PubDate.day.minutes.poly.2 100.000000
## PubDate.day.minutes.poly.1 90.409031
## NDSSName.my.fctrOpEd#Opinion# 33.933845
## PubDate.day.minutes.poly.4 32.183176
## NDSSName.my.fctrBusiness#Crosswords/Games# 31.442333
## NDSSName.my.fctr#Opinion#ThePublicEditor 30.810953
## NDSSName.my.fctrScience#Health# 28.675494
## NDSSName.my.fctrStyles#U.S.# 27.494881
## PubDate.day.minutes.poly.3 25.727586
## NDSSName.my.fctrBusiness#Technology# 17.067558
## WordCount.log1p 16.433441
## PubDate.wkend 16.180212
## PubDate.hour.fctr(7.67,15.3] 15.428347
## PubDate.last16.log1p 15.052359
## PubDate.wkday.fctr1 15.023834
## WordCount.root2 14.827828
## PubDate.last32.log1p 14.634050
## PubDate.date.fctr(7,13] 14.563041
## .rnorm 14.554558
## NDSSName.my.fctrCulture## 14.554558
## NDSSName.my.fctrMetro#N.Y./Region# 14.554558
## PubDate.date.fctr(19,25] 14.554558
## PubDate.date.fctr(25,31] 14.554558
## PubDate.day.minutes.poly.5 14.554558
## PubDate.hour.fctr(15.3,23] 14.554558
## PubDate.juliandate 14.554558
## PubDate.last2.log1p 14.554558
## PubDate.last4.log1p 14.554558
## PubDate.last8.log1p 14.554558
## PubDate.minute.fctr(14.8,29.5] 14.554558
## PubDate.minute.fctr(44.2,59.1] 14.554558
## PubDate.month.fctr10 14.554558
## PubDate.month.fctr12 14.554558
## PubDate.second.fctr(14.8,29.5] 14.554558
## PubDate.second.fctr(29.5,44.2] 14.554558
## PubDate.wkday.fctr3 14.554558
## PubDate.wkday.fctr4 14.554558
## WordCount.nexp 14.554558
## PubDate.wkday.fctr2 14.509947
## PubDate.date.fctr(13,19] 14.316575
## NDSSName.my.fctrCulture#Arts# 14.245181
## PubDate.month.fctr11 14.242882
## PubDate.second.fctr(44.2,59.1] 14.232858
## PubDate.wkday.fctr6 14.029412
## PubDate.wkday.fctr5 13.905284
## PubDate.minute.fctr(29.5,44.2] 13.845057
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 13.736119
## NDSSName.my.fctrTravel#Travel# 12.161655
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 11.406198
## NDSSName.my.fctr#Multimedia# 10.674108
## NDSSName.my.fctrForeign#World# 9.752718
## NDSSName.my.fctrmyOther 9.338429
## NDSSName.my.fctrStyles##Fashion 8.815485
## NDSSName.my.fctrTStyle## 8.276532
## NDSSName.my.fctrForeign#World#AsiaPacific 7.114733
## NDSSName.my.fctr#U.S.#Education 5.735793
## NDSSName.my.fctr#Opinion#RoomForDebate 0.000000
# Diagnostic plots for the final model on glbObsTrn. For binomial
# classification the selected model's opt.prob.threshold.OOB is passed as the
# probability threshold; otherwise no threshold is passed.
if (glb_is_classification && glb_is_binomial) {
  glb_analytics_diag_plots(
    obs_df = glbObsTrn,
    mdl_id = glb_fin_mdl_id,
    prob_threshold = glb_models_df[glb_models_df$id == glb_sel_mdl_id,
                                   "opt.prob.threshold.OOB"]
  )
} else {
  glb_analytics_diag_plots(obs_df = glbObsTrn, mdl_id = glb_fin_mdl_id)
}
## Warning in glb_analytics_diag_plots(obs_df = glbObsTrn, mdl_id =
## glb_fin_mdl_id, : Limiting important feature scatter plots to 5 out of 23
## [1] "Min/Max Boundaries: "
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 1 1065 N NA
## 2 4168 N 0.04115009
## 3 5647 N 0.12998063
## 4 302 N NA
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 1 <NA> NA
## 2 N FALSE
## 3 Y TRUE
## 4 <NA> NA
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 1 NA
## 2 0.04115009
## 3 0.12998063
## 4 NA
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 1 NA
## 2 TRUE
## 3 FALSE
## 4 NA
## Popular.fctr.Final..rcv.glmnet.prob Popular.fctr.Final..rcv.glmnet
## 1 0.02895198 N
## 2 0.00862190 N
## 3 0.11218487 Y
## 4 0.42886804 Y
## Popular.fctr.Final..rcv.glmnet.err
## 1 FALSE
## 2 FALSE
## 3 TRUE
## 4 TRUE
## Popular.fctr.Final..rcv.glmnet.err.abs
## 1 0.02895198
## 2 0.00862190
## 3 0.11218487
## 4 0.42886804
## Popular.fctr.Final..rcv.glmnet.is.acc
## 1 TRUE
## 2 TRUE
## 3 FALSE
## 4 FALSE
## Popular.fctr.Final..rcv.glmnet.accurate
## 1 TRUE
## 2 TRUE
## 3 FALSE
## 4 FALSE
## Popular.fctr.Final..rcv.glmnet.error .label
## 1 0.00000000 1065
## 2 0.00000000 4168
## 3 0.01218487 5647
## 4 0.32886804 302
## [1] "Inaccurate: "
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 1 2182 Y 0.04448175
## 2 4352 Y NA
## 3 4721 Y 0.06694741
## 4 1696 Y 0.05298739
## 5 5486 Y NA
## 6 364 Y 0.06969886
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 1 N TRUE
## 2 <NA> NA
## 3 N TRUE
## 4 N TRUE
## 5 <NA> NA
## 6 N TRUE
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 1 0.9555183
## 2 NA
## 3 0.9330526
## 4 0.9470126
## 5 NA
## 6 0.9303011
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 1 FALSE
## 2 NA
## 3 FALSE
## 4 FALSE
## 5 NA
## 6 FALSE
## Popular.fctr.Final..rcv.glmnet.prob Popular.fctr.Final..rcv.glmnet
## 1 0.007066632 N
## 2 0.011579123 N
## 3 0.014803392 N
## 4 0.014856448 N
## 5 0.017198535 N
## 6 0.019459474 N
## Popular.fctr.Final..rcv.glmnet.err
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
## Popular.fctr.Final..rcv.glmnet.err.abs
## 1 0.9929334
## 2 0.9884209
## 3 0.9851966
## 4 0.9851436
## 5 0.9828015
## 6 0.9805405
## Popular.fctr.Final..rcv.glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Popular.fctr.Final..rcv.glmnet.accurate
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Popular.fctr.Final..rcv.glmnet.error
## 1 -0.09293337
## 2 -0.08842088
## 3 -0.08519661
## 4 -0.08514355
## 5 -0.08280147
## 6 -0.08054053
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 191 988 N 0.11691752
## 221 3609 N 0.13736586
## 382 4016 N 0.09835309
## 413 3378 N NA
## 909 1805 N 0.42155974
## 1110 6511 N NA
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 191 Y TRUE
## 221 Y TRUE
## 382 N FALSE
## 413 <NA> NA
## 909 Y TRUE
## 1110 <NA> NA
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 191 0.11691752
## 221 0.13736586
## 382 0.09835309
## 413 NA
## 909 0.42155974
## 1110 NA
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 191 FALSE
## 221 FALSE
## 382 TRUE
## 413 NA
## 909 FALSE
## 1110 NA
## Popular.fctr.Final..rcv.glmnet.prob Popular.fctr.Final..rcv.glmnet
## 191 0.1117584 Y
## 221 0.1158289 Y
## 382 0.1346807 Y
## 413 0.1377206 Y
## 909 0.3730036 Y
## 1110 0.7233569 Y
## Popular.fctr.Final..rcv.glmnet.err
## 191 TRUE
## 221 TRUE
## 382 TRUE
## 413 TRUE
## 909 TRUE
## 1110 TRUE
## Popular.fctr.Final..rcv.glmnet.err.abs
## 191 0.1117584
## 221 0.1158289
## 382 0.1346807
## 413 0.1377206
## 909 0.3730036
## 1110 0.7233569
## Popular.fctr.Final..rcv.glmnet.is.acc
## 191 FALSE
## 221 FALSE
## 382 FALSE
## 413 FALSE
## 909 FALSE
## 1110 FALSE
## Popular.fctr.Final..rcv.glmnet.accurate
## 191 FALSE
## 221 FALSE
## 382 FALSE
## 413 FALSE
## 909 FALSE
## 1110 FALSE
## Popular.fctr.Final..rcv.glmnet.error
## 191 0.01175842
## 221 0.01582887
## 382 0.03468074
## 413 0.03772058
## 909 0.27300359
## 1110 0.62335692
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 1188 770 N NA
## 1189 2179 N 0.7715713
## 1190 472 N NA
## 1191 2995 N NA
## 1192 1612 N 0.7950623
## 1193 1448 N NA
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 1188 <NA> NA
## 1189 Y TRUE
## 1190 <NA> NA
## 1191 <NA> NA
## 1192 Y TRUE
## 1193 <NA> NA
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 1188 NA
## 1189 0.7715713
## 1190 NA
## 1191 NA
## 1192 0.7950623
## 1193 NA
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 1188 NA
## 1189 FALSE
## 1190 NA
## 1191 NA
## 1192 FALSE
## 1193 NA
## Popular.fctr.Final..rcv.glmnet.prob Popular.fctr.Final..rcv.glmnet
## 1188 0.9488795 Y
## 1189 0.9498766 Y
## 1190 0.9568880 Y
## 1191 0.9602272 Y
## 1192 0.9635908 Y
## 1193 0.9643650 Y
## Popular.fctr.Final..rcv.glmnet.err
## 1188 TRUE
## 1189 TRUE
## 1190 TRUE
## 1191 TRUE
## 1192 TRUE
## 1193 TRUE
## Popular.fctr.Final..rcv.glmnet.err.abs
## 1188 0.9488795
## 1189 0.9498766
## 1190 0.9568880
## 1191 0.9602272
## 1192 0.9635908
## 1193 0.9643650
## Popular.fctr.Final..rcv.glmnet.is.acc
## 1188 FALSE
## 1189 FALSE
## 1190 FALSE
## 1191 FALSE
## 1192 FALSE
## 1193 FALSE
## Popular.fctr.Final..rcv.glmnet.accurate
## 1188 FALSE
## 1189 FALSE
## 1190 FALSE
## 1191 FALSE
## 1192 FALSE
## 1193 FALSE
## Popular.fctr.Final..rcv.glmnet.error
## 1188 0.8488795
## 1189 0.8498766
## 1190 0.8568880
## 1191 0.8602272
## 1192 0.8635908
## 1193 0.8643650
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
# Collect the ids of all features that have a non-missing value in at least
# one column of glb_feats_df whose name contains the literal text ".imp"
# (fixed = TRUE, so the dot is not a regex wildcard).
dsp_feats_vctr <- NULL
for (var in grep(".imp", colnames(glb_feats_df), fixed = TRUE, value = TRUE)) {
  dsp_feats_vctr <- union(
    dsp_feats_vctr,
    glb_feats_df[!is.na(glb_feats_df[, var]), "id"]
  )
}
# print(glbObsTrn[glbObsTrn$UniqueID %in% FN_OOB_ids,
# grep(glb_rsp_var, names(glbObsTrn), value=TRUE)])
# Show which columns exist in glbObsTrn but not (yet) in glbObsAll.
print(setdiff(colnames(glbObsTrn), colnames(glbObsAll)))
## [1] "Popular.fctr.Final..rcv.glmnet.prob"
## [2] "Popular.fctr.Final..rcv.glmnet"
## [3] "Popular.fctr.Final..rcv.glmnet.err"
## [4] "Popular.fctr.Final..rcv.glmnet.err.abs"
## [5] "Popular.fctr.Final..rcv.glmnet.is.acc"
# Copy every column that exists only in glbObsTrn into the rows of glbObsAll
# whose .src is "Train".
for (col in setdiff(colnames(glbObsTrn), colnames(glbObsAll))) {
  # Merge or cbind ?
  # NOTE(review): the assignment is positional -- it assumes glbObsTrn has the
  # same rows, in the same order, as the "Train" rows of glbObsAll. TODO confirm.
  glbObsAll[glbObsAll$.src == "Train", col] <- glbObsTrn[, col]
}
# Show which columns exist in glbObsFit but not in glbObsAll.
print(setdiff(colnames(glbObsFit), colnames(glbObsAll)))
## character(0)
# Show which columns exist in glbObsOOB but not in glbObsAll.
print(setdiff(colnames(glbObsOOB), colnames(glbObsAll)))
## character(0)
# Copy every column that exists only in glbObsOOB into the rows of glbObsAll
# whose .lcn is "OOB".
for (col in setdiff(colnames(glbObsOOB), colnames(glbObsAll))) {
  # Merge or cbind ?
  # NOTE(review): positional assignment -- assumes glbObsOOB rows line up with
  # the "OOB" rows of glbObsAll, and that .lcn contains no NA. TODO confirm.
  glbObsAll[glbObsAll$.lcn == "OOB", col] <- glbObsOOB[, col]
}
# Show which columns exist in glbObsNew but not in glbObsAll.
print(setdiff(colnames(glbObsNew), colnames(glbObsAll)))
## character(0)
# When glb_save_envir is set, write the feature table, the combined
# observations, the model bookkeeping objects and the selected / final models
# to "<glb_out_pfx>dsk.RData". The objects are passed by name through `list`.
if (glb_save_envir) {
  save(
    list = c(
      "glb_feats_df", "glbObsAll",
      # "glbObsTrn", "glbObsFit", "glbObsOOB", "glbObsNew",
      "glb_models_df", "dsp_models_df", "glb_models_lst", "glb_model_type",
      "glb_sel_mdl", "glb_sel_mdl_id",
      "glb_fin_mdl", "glb_fin_mdl_id"
    ),
    file = paste0(glb_out_pfx, "dsk.RData")
  )
}
# Append the two objects produced by this step to glb_analytics_avl_objs.
# The update is done as its own statement rather than inside the
# `replay.trans` argument: R evaluates arguments lazily, so an assignment
# buried in an argument only takes effect if (and when) the callee forces it.
glb_analytics_avl_objs <- c(
  glb_analytics_avl_objs,
  "data.training.all.prediction", "model.final"
)

# Replay the analytics petri net using the updated vector as `replay.trans`.
replay.petrisim(
  pn = glb_analytics_pn,
  replay.trans = glb_analytics_avl_objs,
  flip_coord = TRUE
)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
## 3.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: data.training.all.prediction
## 4.0000 5 0 1 1 1
## 4.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: model.final
## 5.0000 4 0 0 2 1
# Add the "predict.data.new" step to the chunk log kept in glb_chunks_df.
# major.inc = TRUE is passed through to myadd_chunk; the printed log shows
# step_major moving from 7 to 8 for this entry.
glb_chunks_df <- myadd_chunk(
  glb_chunks_df,
  "predict.data.new",
  major.inc = TRUE
)
## label step_major step_minor label_minor bgn end
## 15 fit.data.training 7 1 1 448.725 458.797
## 16 predict.data.new 8 0 0 458.798 NA
## elapsed
## 15 10.072
## 16 NA
8.0: predict data new
## Warning in glb_get_predictions(obs_df, mdl_id = glb_fin_mdl_id, rsp_var =
## glb_rsp_var, : Using default probability threshold: 0.1
## Warning in glb_get_predictions(obs_df, mdl_id = glb_fin_mdl_id, rsp_var =
## glb_rsp_var, : Using default probability threshold: 0.1
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in glb_analytics_diag_plots(obs_df = glbObsNew, mdl_id =
## glb_fin_mdl_id, : Limiting important feature scatter plots to 5 out of 23
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## NULL
## Loading required package: stringr
## [1] "ObsNew Prediction errors in categories:"
## NDSSName.my.fctr .n.Trn.N .n.Trn.Y .n.New.N .n.New.Y
## 5 #U.S.#Education 325 NA 87 2
## 10 Culture## 1 NA 48 22
## .n.Trn.N .n.Trn.Y .n.New.N .n.New.Y
## 326 0 135 24
## Loading required package: tidyr
##
## Attaching package: 'tidyr'
##
## The following object is masked from 'package:Matrix':
##
## expand
## [1] "OOBobs Popular.fctr.All.X..rcv.glmnet Y: min < min of Train range: 10"
## UniqueID Popular.fctr.All.X..rcv.glmnet
## 1631 1631 Y
## 5431 5431 Y
## 1906 1906 Y
## 3872 3872 Y
## 2645 2645 Y
## 6435 6435 Y
## 1767 1767 Y
## 1923 1923 Y
## 4223 4223 Y
## 1930 1930 Y
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.2
## 1631 0.04035041 0.002185492
## 5431 0.18988524 -0.003298637
## 1906 -0.28422208 -0.007994015
## 3872 0.01079276 -0.008744657
## 2645 0.04444564 0.012594250
## 6435 -0.02734665 0.002105741
## 1767 -0.02245774 -0.004384985
## 1923 -0.06338396 -0.008641599
## 4223 -0.04543407 -0.008758791
## 1930 0.02392413 -0.007891784
## PubDate.day.minutes.poly.2.ctg PubDate.day.minutes.poly.4
## 1631 -0.164542461 -0.011653637
## 5431 0.160017937 -0.004425843
## 1906 0.189926123 0.006563830
## 3872 -0.104973596 0.009341415
## 2645 0.035996050 -0.018240126
## 6435 -0.017879840 -0.010140008
## 1767 -0.201374531 -0.002340509
## 1923 -0.156983579 0.009616977
## 4223 -0.006162687 0.009518864
## 1930 -0.159715310 0.008326544
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.5.ctg
## 1631 0.08825344 0.18274711
## 5431 -0.40164220 -0.54313226
## 1906 -0.30249370 0.41093522
## 3872 -0.51296095 -0.45711886
## 2645 -0.01082616 -0.03079946
## 6435 0.02178124 -0.02033356
## 1767 0.33305455 -0.13083307
## 1923 0.21319110 -0.10732959
## 4223 -0.02181201 -0.01124082
## 1930 0.19588297 0.13994102
## WordCount.log1p WordCount.root2
## 1631 5.598422 16.40122
## 5431 7.295056 38.36665
## 1906 7.160846 35.87478
## 3872 6.647688 27.74887
## 2645 6.635947 27.58623
## 6435 0.000000 0.00000
## 1767 6.979145 32.75668
## 1923 6.142037 21.54066
## 4223 7.274480 37.97368
## 1930 5.541264 15.93738
## id cor.y
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.1.ctg -0.002432289
## PubDate.day.minutes.poly.2 PubDate.day.minutes.poly.2 0.070977720
## PubDate.day.minutes.poly.2.ctg PubDate.day.minutes.poly.2.ctg 0.003596414
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4 0.073941394
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.4.ctg 0.014601521
## PubDate.day.minutes.poly.5.ctg PubDate.day.minutes.poly.5.ctg 0.014574775
## WordCount.log1p WordCount.log1p 0.254319628
## WordCount.root2 WordCount.root2 0.292120679
## exclude.as.feat cor.y.abs cor.high.X
## PubDate.day.minutes.poly.1.ctg FALSE 0.002432289 <NA>
## PubDate.day.minutes.poly.2 FALSE 0.070977720 <NA>
## PubDate.day.minutes.poly.2.ctg FALSE 0.003596414 <NA>
## PubDate.day.minutes.poly.4 FALSE 0.073941394 <NA>
## PubDate.day.minutes.poly.4.ctg FALSE 0.014601521 <NA>
## PubDate.day.minutes.poly.5.ctg FALSE 0.014574775 <NA>
## WordCount.log1p FALSE 0.254319628 WordCount.root2
## WordCount.root2 FALSE 0.292120679 <NA>
## freqRatio percentUnique zeroVar nzv
## PubDate.day.minutes.poly.1.ctg 1.083333 53.96509 FALSE FALSE
## PubDate.day.minutes.poly.2 1.225490 18.08022 FALSE FALSE
## PubDate.day.minutes.poly.2.ctg 1.083333 53.94979 FALSE FALSE
## PubDate.day.minutes.poly.4 1.225490 18.08022 FALSE FALSE
## PubDate.day.minutes.poly.4.ctg 1.083333 53.94979 FALSE FALSE
## PubDate.day.minutes.poly.5.ctg 1.083333 53.94979 FALSE FALSE
## WordCount.log1p 2.315789 24.15799 FALSE FALSE
## WordCount.root2 2.315789 24.15799 FALSE FALSE
## is.cor.y.abs.low interaction.feat
## PubDate.day.minutes.poly.1.ctg TRUE NDSSName.my.fctr
## PubDate.day.minutes.poly.2 FALSE <NA>
## PubDate.day.minutes.poly.2.ctg TRUE NDSSName.my.fctr
## PubDate.day.minutes.poly.4 FALSE <NA>
## PubDate.day.minutes.poly.4.ctg FALSE NDSSName.my.fctr
## PubDate.day.minutes.poly.5.ctg FALSE NDSSName.my.fctr
## WordCount.log1p FALSE <NA>
## WordCount.root2 FALSE <NA>
## shapiro.test.p.value rsp_var_raw id_var
## PubDate.day.minutes.poly.1.ctg 1.051535e-45 FALSE NA
## PubDate.day.minutes.poly.2 8.020999e-64 FALSE NA
## PubDate.day.minutes.poly.2.ctg 2.302769e-65 FALSE NA
## PubDate.day.minutes.poly.4 1.523136e-47 FALSE NA
## PubDate.day.minutes.poly.4.ctg 2.214419e-67 FALSE NA
## PubDate.day.minutes.poly.5.ctg 7.171204e-67 FALSE NA
## WordCount.log1p 1.576866e-49 FALSE NA
## WordCount.root2 4.556481e-30 FALSE NA
## rsp_var max min
## PubDate.day.minutes.poly.1.ctg NA 0.48127714 -0.707011442
## PubDate.day.minutes.poly.2 NA 0.04268445 -0.008758791
## PubDate.day.minutes.poly.2.ctg NA 0.75539456 -0.221260607
## PubDate.day.minutes.poly.4 NA 0.06677441 -0.018327397
## PubDate.day.minutes.poly.4.ctg NA 0.67700049 -0.611884133
## PubDate.day.minutes.poly.5.ctg NA 0.56286316 -0.716534449
## WordCount.log1p NA 9.29771002 0.000000000
## WordCount.root2 NA 104.46051886 0.000000000
## max.Popular.fctr.N max.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg 0.35968907 0.32950245
## PubDate.day.minutes.poly.2 0.04268445 0.04254377
## PubDate.day.minutes.poly.2.ctg 0.75539456 0.43056671
## PubDate.day.minutes.poly.4 0.06543120 0.06149053
## PubDate.day.minutes.poly.4.ctg 0.67700049 0.28961875
## PubDate.day.minutes.poly.5.ctg 0.56286316 0.21585241
## WordCount.log1p 8.81966535 9.29771002
## WordCount.root2 82.24962006 104.46051886
## min.Popular.fctr.N min.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg -0.399776908 -0.245703803
## PubDate.day.minutes.poly.2 -0.008758791 -0.008758717
## PubDate.day.minutes.poly.2.ctg -0.221260607 -0.155122711
## PubDate.day.minutes.poly.4 -0.018327397 -0.018219595
## PubDate.day.minutes.poly.4.ctg -0.611884133 -0.282432189
## PubDate.day.minutes.poly.5.ctg -0.716534449 -0.370586479
## WordCount.log1p 0.000000000 1.945910149
## WordCount.root2 0.000000000 2.449489743
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.48127714
## PubDate.day.minutes.poly.2 0.04268445
## PubDate.day.minutes.poly.2.ctg 0.74739148
## PubDate.day.minutes.poly.4 0.06213811
## PubDate.day.minutes.poly.4.ctg 0.49508199
## PubDate.day.minutes.poly.5.ctg 0.48962874
## WordCount.log1p 7.05961763
## WordCount.root2 34.10278581
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.19773100
## PubDate.day.minutes.poly.2 0.04254377
## PubDate.day.minutes.poly.2.ctg 0.35042637
## PubDate.day.minutes.poly.4 0.06610094
## PubDate.day.minutes.poly.4.ctg 0.45727441
## PubDate.day.minutes.poly.5.ctg 0.41093522
## WordCount.log1p 9.14088311
## WordCount.root2 96.58157174
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.707011442
## PubDate.day.minutes.poly.2 -0.008758672
## PubDate.day.minutes.poly.2.ctg -0.201374531
## PubDate.day.minutes.poly.4 -0.018326850
## PubDate.day.minutes.poly.4.ctg -0.163870979
## PubDate.day.minutes.poly.5.ctg -0.211418410
## WordCount.log1p 0.000000000
## WordCount.root2 0.000000000
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.284222080
## PubDate.day.minutes.poly.2 -0.008758791
## PubDate.day.minutes.poly.2.ctg -0.201374531
## PubDate.day.minutes.poly.4 -0.018240126
## PubDate.day.minutes.poly.4.ctg -0.512960949
## PubDate.day.minutes.poly.5.ctg -0.543132262
## WordCount.log1p 0.000000000
## WordCount.root2 0.000000000
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.22609552
## PubDate.day.minutes.poly.2 0.04268445
## PubDate.day.minutes.poly.2.ctg 0.60416557
## PubDate.day.minutes.poly.4 0.05893666
## PubDate.day.minutes.poly.4.ctg 0.63819571
## PubDate.day.minutes.poly.5.ctg 0.45824974
## WordCount.log1p 7.94093976
## WordCount.root2 53.00000000
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.19641586
## PubDate.day.minutes.poly.2 0.04254377
## PubDate.day.minutes.poly.2.ctg 0.33756882
## PubDate.day.minutes.poly.4 0.06677441
## PubDate.day.minutes.poly.4.ctg 0.38235412
## PubDate.day.minutes.poly.5.ctg 0.42244492
## WordCount.log1p 8.69232228
## WordCount.root2 77.17512553
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.254654849
## PubDate.day.minutes.poly.2 -0.008758791
## PubDate.day.minutes.poly.2.ctg -0.191639985
## PubDate.day.minutes.poly.4 -0.018322678
## PubDate.day.minutes.poly.4.ctg -0.239606422
## PubDate.day.minutes.poly.5.ctg -0.354757272
## WordCount.log1p 0.000000000
## WordCount.root2 0.000000000
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.253130167
## PubDate.day.minutes.poly.2 -0.008758672
## PubDate.day.minutes.poly.2.ctg -0.189941446
## PubDate.day.minutes.poly.4 -0.018203392
## PubDate.day.minutes.poly.4.ctg -0.205244731
## PubDate.day.minutes.poly.5.ctg -0.280963223
## WordCount.log1p 1.609437912
## WordCount.root2 2.000000000
## [1] "OOBobs Popular.fctr.All.X..rcv.glmnet Y: max > max of Train range: 16"
## UniqueID Popular.fctr.All.X..rcv.glmnet PubDate.day.minutes.poly.1
## 1908 1908 Y 0.001882238
## 1922 1922 Y -0.002039521
## 5233 5233 Y -0.016600865
## 6528 6528 Y 0.001809613
## 1627 1627 Y 0.012231324
## 1906 1906 Y 0.002281677
## 302 302 Y 0.024722851
## 3770 3770 Y 0.001918551
## 4466 4466 Y 0.005586121
## 6435 6435 Y -0.013151170
## 1767 1767 Y 0.006784437
## 1923 1923 Y -0.002221084
## 1928 1928 Y -0.003709899
## 6517 6517 Y 0.020474279
## 3205 3205 Y 0.002898990
## 6521 6521 Y 0.013901702
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.3.ctg
## 1908 -0.004571013 0.13906448
## 1922 0.001236262 -0.15016859
## 5233 0.007145007 0.01482341
## 6528 -0.004472838 -0.03222702
## 1627 -0.007282868 -0.13040699
## 1906 -0.005100600 0.03046261
## 302 0.051829879 0.01497441
## 3770 -0.004619890 0.43176828
## 4466 -0.008578107 0.03634546
## 6435 0.010843928 0.01771752
## 1767 -0.009309113 0.02489965
## 1923 0.001515129 0.08106078
## 1928 0.003766018 -0.12101585
## 6517 0.020825101 -0.01019734
## 3205 -0.005881110 0.01996895
## 6521 -0.004412350 -0.01394151
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4.ctg
## 1908 0.0071228891 -0.23421325
## 1922 0.0096415269 -0.09688160
## 5233 -0.0170009782 0.02271875
## 6528 0.0072191411 -0.01196576
## 1627 -0.0128746495 -0.11643110
## 1906 0.0065638297 -0.30249370
## 302 0.0661009370 0.01396653
## 3770 0.0070741317 0.45727441
## 4466 0.0003590042 0.11223265
## 6435 -0.0101400077 0.02178124
## 1767 -0.0023405093 0.33305455
## 1923 0.0096169766 0.21319110
## 1928 0.0089316619 -0.23544576
## 6517 0.0101361259 0.02716643
## 3205 0.0056054933 0.01405419
## 6521 -0.0138030620 -0.03721490
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5.ctg
## 1908 0.006785787 0.12227512
## 1922 -0.002098247 0.04932191
## 5233 0.013803383 -0.01404809
## 6528 0.006658270 0.02271226
## 1627 -0.006115494 0.30341253
## 1906 0.007446867 0.41093522
## 302 0.083442278 -0.02109223
## 3770 0.006848725 0.38140927
## 4466 0.009641107 -0.01748172
## 6435 -0.001891593 -0.02033356
## 1767 0.008727735 -0.13083307
## 1923 -0.002547032 -0.10732959
## 1928 -0.006056430 -0.21554955
## 6517 -0.004924177 0.01414065
## 3205 0.008323368 -0.02586218
## 6521 -0.012191759 -0.02427900
## PubDate.last16.log1p PubDate.last16.log1p.ctg PubDate.last2.log1p.ctg
## 1908 8.872627 0.00000 13.672085
## 1922 9.175852 15.75212 13.934985
## 5233 11.762734 11.82663 8.116417
## 6528 11.956983 12.49349 9.209840
## 1627 8.992806 0.00000 0.000000
## 1906 8.873748 0.00000 12.444822
## 302 10.077063 11.12598 7.814400
## 3770 8.888343 13.37261 10.528838
## 4466 8.716700 13.93166 13.659964
## 6435 10.338641 12.76144 10.337832
## 1767 8.591373 13.86545 12.657569
## 1923 9.173261 15.77251 14.253507
## 1928 9.283033 15.61660 13.669639
## 6517 11.448558 12.13813 11.371500
## 3205 11.850590 12.37754 8.836665
## 6521 11.685390 12.56239 9.680219
## PubDate.last32.log1p PubDate.last8.log1p WordCount.nexp
## 1908 9.718182 8.173857 1.994412e-151
## 1922 10.010547 8.276395 0.000000e+00
## 5233 11.819748 11.622461 5.482209e-194
## 6528 12.109529 11.425547 0.000000e+00
## 1627 9.504129 8.393216 0.000000e+00
## 1906 9.729253 8.075272 0.000000e+00
## 302 10.332897 9.741557 0.000000e+00
## 3770 9.938710 8.392537 3.418239e-166
## 4466 9.313619 7.827640 2.371872e-102
## 6435 10.866967 8.695172 1.000000e+00
## 1767 9.415401 7.891331 0.000000e+00
## 1923 10.018600 8.363109 3.071570e-202
## 1928 10.027959 8.536800 0.000000e+00
## 6517 12.217912 9.753188 2.750325e-314
## 3205 12.005436 11.443361 1.026188e-10
## 6521 12.196224 10.414633 0.000000e+00
## id cor.y
## PubDate.day.minutes.poly.1 PubDate.day.minutes.poly.1 0.156753478
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.3 0.027983551
## PubDate.day.minutes.poly.3.ctg PubDate.day.minutes.poly.3.ctg 0.014982807
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4 0.073941394
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.4.ctg 0.014601521
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5 -0.055929231
## PubDate.day.minutes.poly.5.ctg PubDate.day.minutes.poly.5.ctg 0.014574775
## PubDate.last16.log1p PubDate.last16.log1p 0.040735543
## PubDate.last16.log1p.ctg PubDate.last16.log1p.ctg 0.007783530
## PubDate.last2.log1p.ctg PubDate.last2.log1p.ctg 0.006916600
## PubDate.last32.log1p PubDate.last32.log1p 0.003558081
## PubDate.last8.log1p PubDate.last8.log1p 0.054458821
## WordCount.nexp WordCount.nexp -0.053208396
## exclude.as.feat cor.y.abs
## PubDate.day.minutes.poly.1 FALSE 0.156753478
## PubDate.day.minutes.poly.3 FALSE 0.027983551
## PubDate.day.minutes.poly.3.ctg FALSE 0.014982807
## PubDate.day.minutes.poly.4 FALSE 0.073941394
## PubDate.day.minutes.poly.4.ctg FALSE 0.014601521
## PubDate.day.minutes.poly.5 FALSE 0.055929231
## PubDate.day.minutes.poly.5.ctg FALSE 0.014574775
## PubDate.last16.log1p FALSE 0.040735543
## PubDate.last16.log1p.ctg FALSE 0.007783530
## PubDate.last2.log1p.ctg FALSE 0.006916600
## PubDate.last32.log1p FALSE 0.003558081
## PubDate.last8.log1p FALSE 0.054458821
## WordCount.nexp FALSE 0.053208396
## cor.high.X freqRatio percentUnique
## PubDate.day.minutes.poly.1 <NA> 1.225490 18.08022
## PubDate.day.minutes.poly.3 <NA> 1.225490 18.08022
## PubDate.day.minutes.poly.3.ctg <NA> 1.083333 53.96509
## PubDate.day.minutes.poly.4 <NA> 1.225490 18.08022
## PubDate.day.minutes.poly.4.ctg <NA> 1.083333 53.94979
## PubDate.day.minutes.poly.5 <NA> 1.225490 18.08022
## PubDate.day.minutes.poly.5.ctg <NA> 1.083333 53.94979
## PubDate.last16.log1p <NA> 3.200000 84.44581
## PubDate.last16.log1p.ctg <NA> 60.000000 95.17759
## PubDate.last2.log1p.ctg <NA> 5.000000 92.19228
## PubDate.last32.log1p <NA> 8.000000 90.99816
## PubDate.last8.log1p PubDate.last4.log1p 1.142857 75.12247
## WordCount.nexp <NA> 17.761364 11.32884
## zeroVar nzv is.cor.y.abs.low
## PubDate.day.minutes.poly.1 FALSE FALSE FALSE
## PubDate.day.minutes.poly.3 FALSE FALSE FALSE
## PubDate.day.minutes.poly.3.ctg FALSE FALSE FALSE
## PubDate.day.minutes.poly.4 FALSE FALSE FALSE
## PubDate.day.minutes.poly.4.ctg FALSE FALSE FALSE
## PubDate.day.minutes.poly.5 FALSE FALSE FALSE
## PubDate.day.minutes.poly.5.ctg FALSE FALSE FALSE
## PubDate.last16.log1p FALSE FALSE FALSE
## PubDate.last16.log1p.ctg FALSE FALSE TRUE
## PubDate.last2.log1p.ctg FALSE FALSE TRUE
## PubDate.last32.log1p FALSE FALSE TRUE
## PubDate.last8.log1p FALSE FALSE FALSE
## WordCount.nexp FALSE FALSE FALSE
## interaction.feat shapiro.test.p.value
## PubDate.day.minutes.poly.1 <NA> 1.590362e-18
## PubDate.day.minutes.poly.3 <NA> 9.822405e-52
## PubDate.day.minutes.poly.3.ctg NDSSName.my.fctr 1.179915e-64
## PubDate.day.minutes.poly.4 <NA> 1.523136e-47
## PubDate.day.minutes.poly.4.ctg NDSSName.my.fctr 2.214419e-67
## PubDate.day.minutes.poly.5 <NA> 1.157500e-41
## PubDate.day.minutes.poly.5.ctg NDSSName.my.fctr 7.171204e-67
## PubDate.last16.log1p <NA> 7.310334e-68
## PubDate.last16.log1p.ctg NDSSName.my.fctr 6.216597e-76
## PubDate.last2.log1p.ctg NDSSName.my.fctr 1.991089e-37
## PubDate.last32.log1p <NA> 2.783236e-77
## PubDate.last8.log1p <NA> 3.859176e-56
## WordCount.nexp <NA> 9.108805e-94
## rsp_var_raw id_var rsp_var max
## PubDate.day.minutes.poly.1 FALSE NA NA 0.02475916
## PubDate.day.minutes.poly.3 FALSE NA NA 0.05215301
## PubDate.day.minutes.poly.3.ctg FALSE NA NA 0.56127224
## PubDate.day.minutes.poly.4 FALSE NA NA 0.06677441
## PubDate.day.minutes.poly.4.ctg FALSE NA NA 0.67700049
## PubDate.day.minutes.poly.5 FALSE NA NA 0.08471756
## PubDate.day.minutes.poly.5.ctg FALSE NA NA 0.56286316
## PubDate.last16.log1p FALSE NA NA 11.95698288
## PubDate.last16.log1p.ctg FALSE NA NA 15.77251197
## PubDate.last2.log1p.ctg FALSE NA NA 15.06116892
## PubDate.last32.log1p FALSE NA NA 12.32340669
## PubDate.last8.log1p FALSE NA NA 11.62246125
## WordCount.nexp FALSE NA NA 1.00000000
## min max.Popular.fctr.N
## PubDate.day.minutes.poly.1 -0.02749464 0.02468654
## PubDate.day.minutes.poly.3 -0.04512497 0.05150779
## PubDate.day.minutes.poly.3.ctg -0.66283168 0.55528441
## PubDate.day.minutes.poly.4 -0.01832740 0.06543120
## PubDate.day.minutes.poly.4.ctg -0.61188413 0.67700049
## PubDate.day.minutes.poly.5 -0.02450918 0.08217780
## PubDate.day.minutes.poly.5.ctg -0.71653445 0.56286316
## PubDate.last16.log1p 0.00000000 11.94531808
## PubDate.last16.log1p.ctg 0.00000000 15.72030254
## PubDate.last2.log1p.ctg 0.00000000 14.72999406
## PubDate.last32.log1p 0.00000000 12.21244232
## PubDate.last8.log1p 0.00000000 11.43577441
## WordCount.nexp 0.00000000 1.00000000
## max.Popular.fctr.Y min.Popular.fctr.N
## PubDate.day.minutes.poly.1 0.024468663 -0.02749464
## PubDate.day.minutes.poly.3 0.049597025 -0.04512497
## PubDate.day.minutes.poly.3.ctg 0.363266956 -0.66283168
## PubDate.day.minutes.poly.4 0.061490534 -0.01832740
## PubDate.day.minutes.poly.4.ctg 0.289618754 -0.61188413
## PubDate.day.minutes.poly.5 0.074814724 -0.02450918
## PubDate.day.minutes.poly.5.ctg 0.215852412 -0.71653445
## PubDate.last16.log1p 11.877603300 0.00000000
## PubDate.last16.log1p.ctg 15.629535143 0.00000000
## PubDate.last2.log1p.ctg 13.653551472 0.00000000
## PubDate.last32.log1p 12.178408497 0.00000000
## PubDate.last8.log1p 11.394288315 0.00000000
## WordCount.nexp 0.002478752 0.00000000
## min.Popular.fctr.Y
## PubDate.day.minutes.poly.1 -0.02745833
## PubDate.day.minutes.poly.3 -0.04482024
## PubDate.day.minutes.poly.3.ctg -0.38280444
## PubDate.day.minutes.poly.4 -0.01821959
## PubDate.day.minutes.poly.4.ctg -0.28243219
## PubDate.day.minutes.poly.5 -0.02362780
## PubDate.day.minutes.poly.5.ctg -0.37058648
## PubDate.last16.log1p 0.00000000
## PubDate.last16.log1p.ctg 0.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 0.00000000
## PubDate.last8.log1p 0.00000000
## WordCount.nexp 0.00000000
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1 0.02450498
## PubDate.day.minutes.poly.3 0.04991290
## PubDate.day.minutes.poly.3.ctg 0.41228572
## PubDate.day.minutes.poly.4 0.06213811
## PubDate.day.minutes.poly.4.ctg 0.49508199
## PubDate.day.minutes.poly.5 0.07601554
## PubDate.day.minutes.poly.5.ctg 0.48962874
## PubDate.last16.log1p 11.84854019
## PubDate.last16.log1p.ctg 15.72589745
## PubDate.last2.log1p.ctg 14.07883590
## PubDate.last32.log1p 12.17383350
## PubDate.last8.log1p 11.40150216
## WordCount.nexp 1.00000000
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 0.02472285
## PubDate.day.minutes.poly.3 0.05182988
## PubDate.day.minutes.poly.3.ctg 0.43176828
## PubDate.day.minutes.poly.4 0.06610094
## PubDate.day.minutes.poly.4.ctg 0.45727441
## PubDate.day.minutes.poly.5 0.08344228
## PubDate.day.minutes.poly.5.ctg 0.41093522
## PubDate.last16.log1p 11.95698288
## PubDate.last16.log1p.ctg 15.77251197
## PubDate.last2.log1p.ctg 14.25350675
## PubDate.last32.log1p 12.21791228
## PubDate.last8.log1p 11.62246125
## WordCount.nexp 1.00000000
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1 -0.02749464
## PubDate.day.minutes.poly.3 -0.04512497
## PubDate.day.minutes.poly.3.ctg -0.39431764
## PubDate.day.minutes.poly.4 -0.01832685
## PubDate.day.minutes.poly.4.ctg -0.16387098
## PubDate.day.minutes.poly.5 -0.02450918
## PubDate.day.minutes.poly.5.ctg -0.21141841
## PubDate.last16.log1p 0.00000000
## PubDate.last16.log1p.ctg 0.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 0.00000000
## PubDate.last8.log1p 0.00000000
## WordCount.nexp 0.00000000
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 -0.02745833
## PubDate.day.minutes.poly.3 -0.04482024
## PubDate.day.minutes.poly.3.ctg -0.24211460
## PubDate.day.minutes.poly.4 -0.01824013
## PubDate.day.minutes.poly.4.ctg -0.51296095
## PubDate.day.minutes.poly.5 -0.02362780
## PubDate.day.minutes.poly.5.ctg -0.54313226
## PubDate.last16.log1p 0.00000000
## PubDate.last16.log1p.ctg 0.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 0.00000000
## PubDate.last8.log1p 7.01929665
## WordCount.nexp 0.00000000
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1 0.02432341
## PubDate.day.minutes.poly.3 0.04834381
## PubDate.day.minutes.poly.3.ctg 0.56127224
## PubDate.day.minutes.poly.4 0.05893666
## PubDate.day.minutes.poly.4.ctg 0.63819571
## PubDate.day.minutes.poly.5 0.07011504
## PubDate.day.minutes.poly.5.ctg 0.45824974
## PubDate.last16.log1p 11.88113167
## PubDate.last16.log1p.ctg 15.68420514
## PubDate.last2.log1p.ctg 15.06116892
## PubDate.last32.log1p 12.32340669
## PubDate.last8.log1p 11.27955479
## WordCount.nexp 1.00000000
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 0.02475916
## PubDate.day.minutes.poly.3 0.05215301
## PubDate.day.minutes.poly.3.ctg 0.34217266
## PubDate.day.minutes.poly.4 0.06677441
## PubDate.day.minutes.poly.4.ctg 0.38235412
## PubDate.day.minutes.poly.5 0.08471756
## PubDate.day.minutes.poly.5.ctg 0.42244492
## PubDate.last16.log1p 11.84328621
## PubDate.last16.log1p.ctg 15.67548989
## PubDate.last2.log1p.ctg 14.77515997
## PubDate.last32.log1p 12.30546086
## PubDate.last8.log1p 11.33227851
## WordCount.nexp 0.01831564
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1 -0.02749464
## PubDate.day.minutes.poly.3 -0.04512497
## PubDate.day.minutes.poly.3.ctg -0.65503648
## PubDate.day.minutes.poly.4 -0.01832268
## PubDate.day.minutes.poly.4.ctg -0.23960642
## PubDate.day.minutes.poly.5 -0.02450918
## PubDate.day.minutes.poly.5.ctg -0.35475727
## PubDate.last16.log1p 8.10167775
## PubDate.last16.log1p.ctg 0.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 8.86290830
## PubDate.last8.log1p 7.06133437
## WordCount.nexp 0.00000000
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 -0.02745833
## PubDate.day.minutes.poly.3 -0.04482024
## PubDate.day.minutes.poly.3.ctg -0.33854383
## PubDate.day.minutes.poly.4 -0.01820339
## PubDate.day.minutes.poly.4.ctg -0.20524473
## PubDate.day.minutes.poly.5 -0.02362780
## PubDate.day.minutes.poly.5.ctg -0.28096322
## PubDate.last16.log1p 8.09223941
## PubDate.last16.log1p.ctg 0.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 8.83579237
## PubDate.last8.log1p 6.89162590
## WordCount.nexp 0.00000000
## [1] "OOBobs Popular.fctr.All.X..rcv.glmnet N: min < min of Train range: 1"
## UniqueID Popular.fctr.All.X..rcv.glmnet
## 1833 1833 N
## PubDate.day.minutes.poly.1.ctg
## 1833 -0.7070114
## id cor.y
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.1.ctg -0.002432289
## exclude.as.feat cor.y.abs cor.high.X
## PubDate.day.minutes.poly.1.ctg FALSE 0.002432289 <NA>
## freqRatio percentUnique zeroVar nzv
## PubDate.day.minutes.poly.1.ctg 1.083333 53.96509 FALSE FALSE
## is.cor.y.abs.low interaction.feat
## PubDate.day.minutes.poly.1.ctg TRUE NDSSName.my.fctr
## shapiro.test.p.value rsp_var_raw id_var
## PubDate.day.minutes.poly.1.ctg 1.051535e-45 FALSE NA
## rsp_var max min
## PubDate.day.minutes.poly.1.ctg NA 0.4812771 -0.7070114
## max.Popular.fctr.N max.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg 0.3596891 0.3295025
## min.Popular.fctr.N min.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg -0.3997769 -0.2457038
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.4812771
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.197731
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.7070114
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.2842221
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.2260955
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.1964159
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.2546548
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.2531302
## [1] "OOBobs Popular.fctr.All.X..rcv.glmnet N: max > max of Train range: 2"
## UniqueID Popular.fctr.All.X..rcv.glmnet
## 4402 4402 N
## 1924 1924 N
## PubDate.day.minutes.poly.1.ctg PubDate.last16.log1p.ctg
## 4402 0.48127714 0.0000
## 1924 0.04318327 15.7259
## id cor.y
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.1.ctg -0.002432289
## PubDate.last16.log1p.ctg PubDate.last16.log1p.ctg 0.007783530
## exclude.as.feat cor.y.abs cor.high.X
## PubDate.day.minutes.poly.1.ctg FALSE 0.002432289 <NA>
## PubDate.last16.log1p.ctg FALSE 0.007783530 <NA>
## freqRatio percentUnique zeroVar nzv
## PubDate.day.minutes.poly.1.ctg 1.083333 53.96509 FALSE FALSE
## PubDate.last16.log1p.ctg 60.000000 95.17759 FALSE FALSE
## is.cor.y.abs.low interaction.feat
## PubDate.day.minutes.poly.1.ctg TRUE NDSSName.my.fctr
## PubDate.last16.log1p.ctg TRUE NDSSName.my.fctr
## shapiro.test.p.value rsp_var_raw id_var
## PubDate.day.minutes.poly.1.ctg 1.051535e-45 FALSE NA
## PubDate.last16.log1p.ctg 6.216597e-76 FALSE NA
## rsp_var max min
## PubDate.day.minutes.poly.1.ctg NA 0.4812771 -0.7070114
## PubDate.last16.log1p.ctg NA 15.7725120 0.0000000
## max.Popular.fctr.N max.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg 0.3596891 0.3295025
## PubDate.last16.log1p.ctg 15.7203025 15.6295351
## min.Popular.fctr.N min.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg -0.3997769 -0.2457038
## PubDate.last16.log1p.ctg 0.0000000 0.0000000
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.4812771
## PubDate.last16.log1p.ctg 15.7258974
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.197731
## PubDate.last16.log1p.ctg 15.772512
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.7070114
## PubDate.last16.log1p.ctg 0.0000000
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.2842221
## PubDate.last16.log1p.ctg 0.0000000
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.2260955
## PubDate.last16.log1p.ctg 15.6842051
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.1964159
## PubDate.last16.log1p.ctg 15.6754899
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.2546548
## PubDate.last16.log1p.ctg 0.0000000
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.2531302
## PubDate.last16.log1p.ctg 0.0000000
## [1] "OOBobs total range outliers: 25"
## [1] "newobs Popular.fctr.Final..rcv.glmnet N: max > max of Train range: 1186"
## UniqueID Popular.fctr.Final..rcv.glmnet
## 6533 6533 N
## 6540 6540 N
## 6541 6541 N
## 6542 6542 N
## 6543 6543 N
## 6545 6545 N
## PubDate.day.minutes.poly.3.ctg PubDate.juliandate
## 6533 0.01381696 335
## 6540 0.02736795 335
## 6541 0.03472372 335
## 6542 0.03704916 335
## 6543 0.02491822 335
## 6545 -0.03157315 335
## PubDate.last2.log1p.ctg PubDate.last32.log1p PubDate.last32.log1p.ctg
## 6533 11.020840 10.134321 13.09830
## 6540 8.212026 9.418817 13.12125
## 6541 10.856592 9.442800 13.12686
## 6542 11.034890 9.391411 13.12590
## 6543 9.618070 9.400878 12.81628
## 6545 9.879502 9.402860 12.99054
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg
## 6533 11.253274 11.89583
## 6540 11.092611 11.26715
## 6541 11.076186 11.28744
## 6542 11.184338 11.82491
## 6543 10.450423 11.38482
## 6545 9.953134 11.38843
## UniqueID Popular.fctr.Final..rcv.glmnet
## 6629 6629 N
## 7478 7478 N
## 7575 7575 N
## 7720 7720 N
## 7907 7907 N
## 8236 8236 N
## PubDate.day.minutes.poly.3.ctg PubDate.juliandate
## 6629 0.04262694 335
## 7478 0.16415852 346
## 7575 0.03000273 349
## 7720 -0.00890592 351
## 7907 -0.02598039 352
## 8236 0.04836251 360
## PubDate.last2.log1p.ctg PubDate.last32.log1p PubDate.last32.log1p.ctg
## 6629 10.979513 12.219719 12.91549
## 7478 8.575651 10.902777 11.77183
## 7575 9.105202 10.133924 12.38414
## 7720 10.897017 9.276783 12.05991
## 7907 9.109967 10.825760 12.57079
## 8236 11.370474 11.729246 0.00000
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg
## 6629 11.077377 11.770316
## 7478 9.006632 9.574706
## 7575 10.977056 11.092778
## 7720 10.929888 11.051398
## 7907 9.695664 11.335185
## 8236 12.709290 13.274841
## UniqueID Popular.fctr.Final..rcv.glmnet
## 8394 8394 N
## 8395 8395 N
## 8396 8396 N
## 8398 8398 N
## 8400 8400 N
## 8401 8401 N
## PubDate.day.minutes.poly.3.ctg PubDate.juliandate
## 8394 -0.2772583 365
## 8395 0.1129868 365
## 8396 0.1587658 365
## 8398 -0.1431270 365
## 8400 -0.1380997 365
## 8401 0.1926449 365
## PubDate.last2.log1p.ctg PubDate.last32.log1p PubDate.last32.log1p.ctg
## 8394 13.13739 11.14224 15.14526
## 8395 11.44674 11.14428 15.14071
## 8396 13.98310 11.11651 15.39121
## 8398 6.43294 11.09410 15.90984
## 8400 15.06117 11.07855 15.92029
## 8401 11.51832 11.05398 15.91539
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg
## 8394 13.41328 13.96451
## 8395 13.30663 13.83971
## 8396 14.06262 14.57832
## 8398 15.06135 15.37495
## 8400 15.35403 15.48083
## 8401 14.81295 15.38483
## id cor.y
## PubDate.day.minutes.poly.3.ctg PubDate.day.minutes.poly.3.ctg 0.014982807
## PubDate.juliandate PubDate.juliandate 0.014361075
## PubDate.last2.log1p.ctg PubDate.last2.log1p.ctg 0.006916600
## PubDate.last32.log1p PubDate.last32.log1p 0.003558081
## PubDate.last32.log1p.ctg PubDate.last32.log1p.ctg 0.015395971
## PubDate.last4.log1p.ctg PubDate.last4.log1p.ctg 0.004792781
## PubDate.last8.log1p.ctg PubDate.last8.log1p.ctg 0.003914960
## exclude.as.feat cor.y.abs
## PubDate.day.minutes.poly.3.ctg FALSE 0.014982807
## PubDate.juliandate FALSE 0.014361075
## PubDate.last2.log1p.ctg FALSE 0.006916600
## PubDate.last32.log1p FALSE 0.003558081
## PubDate.last32.log1p.ctg FALSE 0.015395971
## PubDate.last4.log1p.ctg FALSE 0.004792781
## PubDate.last8.log1p.ctg FALSE 0.003914960
## cor.high.X freqRatio percentUnique
## PubDate.day.minutes.poly.3.ctg <NA> 1.083333 53.965095
## PubDate.juliandate PubDate.month.fctr 1.032520 1.393141
## PubDate.last2.log1p.ctg <NA> 5.000000 92.192284
## PubDate.last32.log1p <NA> 8.000000 90.998163
## PubDate.last32.log1p.ctg <NA> 239.000000 92.115738
## PubDate.last4.log1p.ctg <NA> 20.000000 95.881813
## PubDate.last8.log1p.ctg <NA> 40.000000 96.417636
## zeroVar nzv is.cor.y.abs.low
## PubDate.day.minutes.poly.3.ctg FALSE FALSE FALSE
## PubDate.juliandate FALSE FALSE FALSE
## PubDate.last2.log1p.ctg FALSE FALSE TRUE
## PubDate.last32.log1p FALSE FALSE TRUE
## PubDate.last32.log1p.ctg FALSE FALSE FALSE
## PubDate.last4.log1p.ctg FALSE FALSE TRUE
## PubDate.last8.log1p.ctg FALSE FALSE TRUE
## interaction.feat shapiro.test.p.value
## PubDate.day.minutes.poly.3.ctg NDSSName.my.fctr 1.179915e-64
## PubDate.juliandate <NA> 1.389406e-35
## PubDate.last2.log1p.ctg NDSSName.my.fctr 1.991089e-37
## PubDate.last32.log1p <NA> 2.783236e-77
## PubDate.last32.log1p.ctg NDSSName.my.fctr 1.647772e-78
## PubDate.last4.log1p.ctg NDSSName.my.fctr 5.833827e-54
## PubDate.last8.log1p.ctg NDSSName.my.fctr 2.241558e-67
## rsp_var_raw id_var rsp_var max
## PubDate.day.minutes.poly.3.ctg FALSE NA NA 0.5612722
## PubDate.juliandate FALSE NA NA 365.0000000
## PubDate.last2.log1p.ctg FALSE NA NA 15.0611689
## PubDate.last32.log1p FALSE NA NA 12.3234067
## PubDate.last32.log1p.ctg FALSE NA NA 15.9202866
## PubDate.last4.log1p.ctg FALSE NA NA 15.3540272
## PubDate.last8.log1p.ctg FALSE NA NA 15.4808349
## min max.Popular.fctr.N
## PubDate.day.minutes.poly.3.ctg -0.6628317 0.5552844
## PubDate.juliandate 244.0000000 334.0000000
## PubDate.last2.log1p.ctg 0.0000000 14.7299941
## PubDate.last32.log1p 0.0000000 12.2124423
## PubDate.last32.log1p.ctg 0.0000000 15.3192168
## PubDate.last4.log1p.ctg 0.0000000 14.6582245
## PubDate.last8.log1p.ctg 0.0000000 15.1997598
## max.Popular.fctr.Y min.Popular.fctr.N
## PubDate.day.minutes.poly.3.ctg 0.363267 -0.6628317
## PubDate.juliandate 334.000000 244.0000000
## PubDate.last2.log1p.ctg 14.253507 0.0000000
## PubDate.last32.log1p 12.217912 0.0000000
## PubDate.last32.log1p.ctg 15.300332 0.0000000
## PubDate.last4.log1p.ctg 14.733865 0.0000000
## PubDate.last8.log1p.ctg 15.122845 0.0000000
## min.Popular.fctr.Y
## PubDate.day.minutes.poly.3.ctg -0.3828044
## PubDate.juliandate 244.0000000
## PubDate.last2.log1p.ctg 0.0000000
## PubDate.last32.log1p 0.0000000
## PubDate.last32.log1p.ctg 0.0000000
## PubDate.last4.log1p.ctg 0.0000000
## PubDate.last8.log1p.ctg 0.0000000
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.3.ctg 0.4122857
## PubDate.juliandate 332.0000000
## PubDate.last2.log1p.ctg 14.0788359
## PubDate.last32.log1p 12.1738335
## PubDate.last32.log1p.ctg 15.2783647
## PubDate.last4.log1p.ctg 14.5891468
## PubDate.last8.log1p.ctg 15.1063837
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.3.ctg 0.4317683
## PubDate.juliandate 334.0000000
## PubDate.last2.log1p.ctg 14.2535067
## PubDate.last32.log1p 12.2179123
## PubDate.last32.log1p.ctg 15.1773142
## PubDate.last4.log1p.ctg 14.5794661
## PubDate.last8.log1p.ctg 15.0339077
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.3.ctg -0.3943176
## PubDate.juliandate 244.0000000
## PubDate.last2.log1p.ctg 0.0000000
## PubDate.last32.log1p 0.0000000
## PubDate.last32.log1p.ctg 0.0000000
## PubDate.last4.log1p.ctg 0.0000000
## PubDate.last8.log1p.ctg 0.0000000
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.3.ctg -0.2421146
## PubDate.juliandate 244.0000000
## PubDate.last2.log1p.ctg 0.0000000
## PubDate.last32.log1p 0.0000000
## PubDate.last32.log1p.ctg 0.0000000
## PubDate.last4.log1p.ctg 0.0000000
## PubDate.last8.log1p.ctg 0.0000000
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.3.ctg 0.5612722
## PubDate.juliandate 365.0000000
## PubDate.last2.log1p.ctg 15.0611689
## PubDate.last32.log1p 12.3234067
## PubDate.last32.log1p.ctg 15.9202866
## PubDate.last4.log1p.ctg 15.3540272
## PubDate.last8.log1p.ctg 15.4808349
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.3.ctg 0.3421727
## PubDate.juliandate 365.0000000
## PubDate.last2.log1p.ctg 14.7751600
## PubDate.last32.log1p 12.3054609
## PubDate.last32.log1p.ctg 15.9193719
## PubDate.last4.log1p.ctg 15.3435742
## PubDate.last8.log1p.ctg 15.4661689
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.3.ctg -0.6550365
## PubDate.juliandate 335.0000000
## PubDate.last2.log1p.ctg 0.0000000
## PubDate.last32.log1p 8.8629083
## PubDate.last32.log1p.ctg 0.0000000
## PubDate.last4.log1p.ctg 0.0000000
## PubDate.last8.log1p.ctg 0.0000000
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.3.ctg -0.3385438
## PubDate.juliandate 335.0000000
## PubDate.last2.log1p.ctg 0.0000000
## PubDate.last32.log1p 8.8357924
## PubDate.last32.log1p.ctg 0.0000000
## PubDate.last4.log1p.ctg 0.0000000
## PubDate.last8.log1p.ctg 0.0000000
## [1] "newobs Popular.fctr.Final..rcv.glmnet Y: min < min of Train range: 3"
## UniqueID Popular.fctr.Final..rcv.glmnet
## 8217 8217 Y
## 8360 8360 Y
## 8375 8375 Y
## PubDate.day.minutes.poly.1.ctg WordCount.log1p WordCount.root2
## 8217 0.01104094 1.609438 2.00000
## 8360 -0.25313017 7.202661 36.63332
## 8375 -0.25313017 6.274762 23.02173
## id cor.y
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.1.ctg -0.002432289
## WordCount.log1p WordCount.log1p 0.254319628
## WordCount.root2 WordCount.root2 0.292120679
## exclude.as.feat cor.y.abs cor.high.X
## PubDate.day.minutes.poly.1.ctg FALSE 0.002432289 <NA>
## WordCount.log1p FALSE 0.254319628 WordCount.root2
## WordCount.root2 FALSE 0.292120679 <NA>
## freqRatio percentUnique zeroVar nzv
## PubDate.day.minutes.poly.1.ctg 1.083333 53.96509 FALSE FALSE
## WordCount.log1p 2.315789 24.15799 FALSE FALSE
## WordCount.root2 2.315789 24.15799 FALSE FALSE
## is.cor.y.abs.low interaction.feat
## PubDate.day.minutes.poly.1.ctg TRUE NDSSName.my.fctr
## WordCount.log1p FALSE <NA>
## WordCount.root2 FALSE <NA>
## shapiro.test.p.value rsp_var_raw id_var
## PubDate.day.minutes.poly.1.ctg 1.051535e-45 FALSE NA
## WordCount.log1p 1.576866e-49 FALSE NA
## WordCount.root2 4.556481e-30 FALSE NA
## rsp_var max min
## PubDate.day.minutes.poly.1.ctg NA 0.4812771 -0.7070114
## WordCount.log1p NA 9.2977100 0.0000000
## WordCount.root2 NA 104.4605189 0.0000000
## max.Popular.fctr.N max.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg 0.4812771 0.3295025
## WordCount.log1p 8.8196653 9.2977100
## WordCount.root2 82.2496201 104.4605189
## min.Popular.fctr.N min.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg -0.7070114 -0.2457038
## WordCount.log1p 0.0000000 1.9459101
## WordCount.root2 0.0000000 2.4494897
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.4812771
## WordCount.log1p 7.0596176
## WordCount.root2 34.1027858
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.197731
## WordCount.log1p 9.140883
## WordCount.root2 96.581572
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.7070114
## WordCount.log1p 0.0000000
## WordCount.root2 0.0000000
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.2842221
## WordCount.log1p 0.0000000
## WordCount.root2 0.0000000
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.2260955
## WordCount.log1p 7.9409398
## WordCount.root2 53.0000000
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.1964159
## WordCount.log1p 8.6923223
## WordCount.root2 77.1751255
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.2546548
## WordCount.log1p 0.0000000
## WordCount.root2 0.0000000
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.2531302
## WordCount.log1p 1.6094379
## WordCount.root2 2.0000000
## [1] "newobs Popular.fctr.Final..rcv.glmnet Y: max > max of Train range: 684"
## UniqueID Popular.fctr.Final..rcv.glmnet PubDate.day.minutes.poly.1
## 6534 6534 Y 0.02047428
## 6535 6535 Y 0.02043797
## 6536 6536 Y 0.01840446
## 6537 6537 Y 0.01437377
## 6538 6538 Y 0.01408327
## 6539 6539 Y 0.01390170
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.4
## 6534 0.020825101 0.010136126
## 6535 0.020614888 0.009828175
## 6536 0.010192448 -0.003724687
## 6537 -0.003371562 -0.013710045
## 6538 -0.004024505 -0.013788116
## 6539 -0.004412350 -0.013803062
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.5
## 6534 -0.026117829 -0.004924177
## 6535 -0.032426274 -0.005269034
## 6536 0.002579915 -0.017201046
## 6537 0.024407410 -0.013755146
## 6538 0.024419023 -0.012806242
## 6539 0.010731259 -0.012191759
## PubDate.day.minutes.poly.5.ctg PubDate.juliandate
## 6534 -0.04109512 335
## 6535 -0.03723150 335
## 6536 0.02875121 335
## 6537 0.01925208 335
## 6538 -0.01676509 335
## 6539 -0.02825414 335
## PubDate.last2.log1p.ctg PubDate.last32.log1p PubDate.last32.log1p.ctg
## 6534 9.639001 10.036094 13.03463
## 6535 9.896664 10.053458 13.04093
## 6536 9.540219 9.939434 13.01641
## 6537 9.362546 9.564863 13.10988
## 6538 8.234830 9.542733 13.09908
## 6539 7.949797 9.572898 13.11828
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg WordCount.nexp
## 6534 10.283942 10.51086 4.609768e-243
## 6535 10.357965 11.36460 0.000000e+00
## 6536 9.780020 11.33092 0.000000e+00
## 6537 9.580386 11.27835 3.128062e-93
## 6538 8.916506 11.28285 0.000000e+00
## 6539 10.909784 11.26846 0.000000e+00
## UniqueID Popular.fctr.Final..rcv.glmnet PubDate.day.minutes.poly.1
## 6671 6671 Y 0.002463239
## 6873 6873 Y -0.004799277
## 7317 7317 Y 0.012231324
## 7528 7528 Y 0.020437967
## 7734 7734 Y 0.007002312
## 8145 8145 Y -0.010355102
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.4
## 6671 -0.005335173 0.006293548
## 6873 0.005333956 0.007895719
## 7317 -0.007282868 -0.012874650
## 7528 0.020614888 0.009828175
## 7734 -0.009404459 -0.002840648
## 8145 0.010795979 -0.003014347
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.5
## 6671 -0.036401397 0.007723689
## 6873 -0.029511027 -0.008268638
## 7317 -0.026614814 -0.006115494
## 7528 -0.007699323 -0.005269034
## 7734 0.024333713 0.008456902
## 8145 -0.027871354 -0.009591936
## PubDate.day.minutes.poly.5.ctg PubDate.juliandate
## 6671 -0.01480682 336
## 6873 0.03660486 338
## 7317 -0.17779205 345
## 7528 -0.03280407 349
## 7734 0.02195277 351
## 8145 0.02662100 357
## PubDate.last2.log1p.ctg PubDate.last32.log1p PubDate.last32.log1p.ctg
## 6671 9.562616 9.941120 13.21616
## 6873 9.367344 10.918446 13.90708
## 7317 12.057323 9.800014 14.21978
## 7528 10.838070 10.284797 12.02836
## 7734 8.746557 9.242904 12.09562
## 8145 10.961243 11.002700 12.75301
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg WordCount.nexp
## 6671 10.27343 12.05653 0.000000e+00
## 6873 11.32395 12.56299 2.032231e-313
## 7317 12.56482 12.83635 2.329036e-211
## 7528 10.87097 11.07915 2.699143e-152
## 7734 11.25629 11.41736 5.709040e-171
## 8145 11.13282 11.87738 0.000000e+00
## UniqueID Popular.fctr.Final..rcv.glmnet PubDate.day.minutes.poly.1
## 8386 8386 Y -0.004762964
## 8391 8391 Y -0.007559033
## 8392 8392 Y -0.007885846
## 8397 8397 Y -0.012134418
## 8399 8399 Y -0.014204235
## 8402 8402 Y -0.027458327
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.4
## 8386 0.005283252 0.007937277
## 8391 0.008724731 0.003454984
## 8392 0.009050092 0.002780751
## 8397 0.011101808 -0.007579609
## 8399 0.010203360 -0.012625915
## 8402 -0.044820236 0.033658267
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.5
## 8386 -0.194219443 -0.008201720
## 8391 -0.129369992 -0.011437989
## 8392 0.019130981 -0.011513182
## 8397 0.014839874 -0.005346258
## 8399 0.021396462 0.002370447
## 8402 0.001133258 -0.023627798
## PubDate.day.minutes.poly.5.ctg PubDate.juliandate
## 8386 0.12356866 365
## 8391 -0.06697667 365
## 8392 0.15930790 365
## 8397 -0.20117350 365
## 8399 0.16099960 365
## 8402 -0.19528592 365
## PubDate.last2.log1p.ctg PubDate.last32.log1p PubDate.last32.log1p.ctg
## 8386 12.64880 11.19043 0.00000
## 8391 11.99075 11.15722 0.00000
## 8392 13.18411 11.15937 15.18202
## 8397 14.50789 11.11095 15.53790
## 8399 14.77516 11.10274 15.91937
## 8402 11.93860 10.79296 15.91787
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg WordCount.nexp
## 8386 13.40155 13.98196 0
## 8391 12.95591 14.00877 0
## 8392 13.66929 14.13371 0
## 8397 14.55366 14.83106 0
## 8399 15.34357 15.46617 0
## 8402 11.94265 15.38649 0
## id cor.y
## PubDate.day.minutes.poly.1 PubDate.day.minutes.poly.1 0.156753478
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.3 0.027983551
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4 0.073941394
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.4.ctg 0.014601521
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5 -0.055929231
## PubDate.day.minutes.poly.5.ctg PubDate.day.minutes.poly.5.ctg 0.014574775
## PubDate.juliandate PubDate.juliandate 0.014361075
## PubDate.last2.log1p.ctg PubDate.last2.log1p.ctg 0.006916600
## PubDate.last32.log1p PubDate.last32.log1p 0.003558081
## PubDate.last32.log1p.ctg PubDate.last32.log1p.ctg 0.015395971
## PubDate.last4.log1p.ctg PubDate.last4.log1p.ctg 0.004792781
## PubDate.last8.log1p.ctg PubDate.last8.log1p.ctg 0.003914960
## WordCount.nexp WordCount.nexp -0.053208396
## exclude.as.feat cor.y.abs
## PubDate.day.minutes.poly.1 FALSE 0.156753478
## PubDate.day.minutes.poly.3 FALSE 0.027983551
## PubDate.day.minutes.poly.4 FALSE 0.073941394
## PubDate.day.minutes.poly.4.ctg FALSE 0.014601521
## PubDate.day.minutes.poly.5 FALSE 0.055929231
## PubDate.day.minutes.poly.5.ctg FALSE 0.014574775
## PubDate.juliandate FALSE 0.014361075
## PubDate.last2.log1p.ctg FALSE 0.006916600
## PubDate.last32.log1p FALSE 0.003558081
## PubDate.last32.log1p.ctg FALSE 0.015395971
## PubDate.last4.log1p.ctg FALSE 0.004792781
## PubDate.last8.log1p.ctg FALSE 0.003914960
## WordCount.nexp FALSE 0.053208396
## cor.high.X freqRatio percentUnique
## PubDate.day.minutes.poly.1 <NA> 1.225490 18.080220
## PubDate.day.minutes.poly.3 <NA> 1.225490 18.080220
## PubDate.day.minutes.poly.4 <NA> 1.225490 18.080220
## PubDate.day.minutes.poly.4.ctg <NA> 1.083333 53.949786
## PubDate.day.minutes.poly.5 <NA> 1.225490 18.080220
## PubDate.day.minutes.poly.5.ctg <NA> 1.083333 53.949786
## PubDate.juliandate PubDate.month.fctr 1.032520 1.393141
## PubDate.last2.log1p.ctg <NA> 5.000000 92.192284
## PubDate.last32.log1p <NA> 8.000000 90.998163
## PubDate.last32.log1p.ctg <NA> 239.000000 92.115738
## PubDate.last4.log1p.ctg <NA> 20.000000 95.881813
## PubDate.last8.log1p.ctg <NA> 40.000000 96.417636
## WordCount.nexp <NA> 17.761364 11.328843
## zeroVar nzv is.cor.y.abs.low
## PubDate.day.minutes.poly.1 FALSE FALSE FALSE
## PubDate.day.minutes.poly.3 FALSE FALSE FALSE
## PubDate.day.minutes.poly.4 FALSE FALSE FALSE
## PubDate.day.minutes.poly.4.ctg FALSE FALSE FALSE
## PubDate.day.minutes.poly.5 FALSE FALSE FALSE
## PubDate.day.minutes.poly.5.ctg FALSE FALSE FALSE
## PubDate.juliandate FALSE FALSE FALSE
## PubDate.last2.log1p.ctg FALSE FALSE TRUE
## PubDate.last32.log1p FALSE FALSE TRUE
## PubDate.last32.log1p.ctg FALSE FALSE FALSE
## PubDate.last4.log1p.ctg FALSE FALSE TRUE
## PubDate.last8.log1p.ctg FALSE FALSE TRUE
## WordCount.nexp FALSE FALSE FALSE
## interaction.feat shapiro.test.p.value
## PubDate.day.minutes.poly.1 <NA> 1.590362e-18
## PubDate.day.minutes.poly.3 <NA> 9.822405e-52
## PubDate.day.minutes.poly.4 <NA> 1.523136e-47
## PubDate.day.minutes.poly.4.ctg NDSSName.my.fctr 2.214419e-67
## PubDate.day.minutes.poly.5 <NA> 1.157500e-41
## PubDate.day.minutes.poly.5.ctg NDSSName.my.fctr 7.171204e-67
## PubDate.juliandate <NA> 1.389406e-35
## PubDate.last2.log1p.ctg NDSSName.my.fctr 1.991089e-37
## PubDate.last32.log1p <NA> 2.783236e-77
## PubDate.last32.log1p.ctg NDSSName.my.fctr 1.647772e-78
## PubDate.last4.log1p.ctg NDSSName.my.fctr 5.833827e-54
## PubDate.last8.log1p.ctg NDSSName.my.fctr 2.241558e-67
## WordCount.nexp <NA> 9.108805e-94
## rsp_var_raw id_var rsp_var max
## PubDate.day.minutes.poly.1 FALSE NA NA 0.02475916
## PubDate.day.minutes.poly.3 FALSE NA NA 0.05215301
## PubDate.day.minutes.poly.4 FALSE NA NA 0.06677441
## PubDate.day.minutes.poly.4.ctg FALSE NA NA 0.67700049
## PubDate.day.minutes.poly.5 FALSE NA NA 0.08471756
## PubDate.day.minutes.poly.5.ctg FALSE NA NA 0.56286316
## PubDate.juliandate FALSE NA NA 365.00000000
## PubDate.last2.log1p.ctg FALSE NA NA 15.06116892
## PubDate.last32.log1p FALSE NA NA 12.32340669
## PubDate.last32.log1p.ctg FALSE NA NA 15.92028658
## PubDate.last4.log1p.ctg FALSE NA NA 15.35402717
## PubDate.last8.log1p.ctg FALSE NA NA 15.48083492
## WordCount.nexp FALSE NA NA 1.00000000
## min max.Popular.fctr.N
## PubDate.day.minutes.poly.1 -0.02749464 0.02472285
## PubDate.day.minutes.poly.3 -0.04512497 0.05182988
## PubDate.day.minutes.poly.4 -0.01832740 0.06610094
## PubDate.day.minutes.poly.4.ctg -0.61188413 0.67700049
## PubDate.day.minutes.poly.5 -0.02450918 0.08344228
## PubDate.day.minutes.poly.5.ctg -0.71653445 0.56286316
## PubDate.juliandate 244.00000000 334.00000000
## PubDate.last2.log1p.ctg 0.00000000 14.72999406
## PubDate.last32.log1p 0.00000000 12.21244232
## PubDate.last32.log1p.ctg 0.00000000 15.31921677
## PubDate.last4.log1p.ctg 0.00000000 14.65822450
## PubDate.last8.log1p.ctg 0.00000000 15.19975983
## WordCount.nexp 0.00000000 1.00000000
## max.Popular.fctr.Y min.Popular.fctr.N
## PubDate.day.minutes.poly.1 2.446866e-02 -0.02749464
## PubDate.day.minutes.poly.3 4.959703e-02 -0.04512497
## PubDate.day.minutes.poly.4 6.149053e-02 -0.01832740
## PubDate.day.minutes.poly.4.ctg 3.330545e-01 -0.61188413
## PubDate.day.minutes.poly.5 7.481472e-02 -0.02450918
## PubDate.day.minutes.poly.5.ctg 2.158524e-01 -0.71653445
## PubDate.juliandate 3.340000e+02 244.00000000
## PubDate.last2.log1p.ctg 1.425351e+01 0.00000000
## PubDate.last32.log1p 1.221791e+01 0.00000000
## PubDate.last32.log1p.ctg 1.530033e+01 0.00000000
## PubDate.last4.log1p.ctg 1.473386e+01 0.00000000
## PubDate.last8.log1p.ctg 1.512285e+01 0.00000000
## WordCount.nexp 2.478752e-03 0.00000000
## min.Popular.fctr.Y
## PubDate.day.minutes.poly.1 -0.02745833
## PubDate.day.minutes.poly.3 -0.04482024
## PubDate.day.minutes.poly.4 -0.01821959
## PubDate.day.minutes.poly.4.ctg -0.28243219
## PubDate.day.minutes.poly.5 -0.02362780
## PubDate.day.minutes.poly.5.ctg -0.37058648
## PubDate.juliandate 244.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 0.00000000
## PubDate.last32.log1p.ctg 0.00000000
## PubDate.last4.log1p.ctg 0.00000000
## PubDate.last8.log1p.ctg 0.00000000
## WordCount.nexp 0.00000000
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1 0.02450498
## PubDate.day.minutes.poly.3 0.04991290
## PubDate.day.minutes.poly.4 0.06213811
## PubDate.day.minutes.poly.4.ctg 0.49508199
## PubDate.day.minutes.poly.5 0.07601554
## PubDate.day.minutes.poly.5.ctg 0.48962874
## PubDate.juliandate 332.00000000
## PubDate.last2.log1p.ctg 14.07883590
## PubDate.last32.log1p 12.17383350
## PubDate.last32.log1p.ctg 15.27836472
## PubDate.last4.log1p.ctg 14.58914681
## PubDate.last8.log1p.ctg 15.10638373
## WordCount.nexp 1.00000000
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 0.02472285
## PubDate.day.minutes.poly.3 0.05182988
## PubDate.day.minutes.poly.4 0.06610094
## PubDate.day.minutes.poly.4.ctg 0.45727441
## PubDate.day.minutes.poly.5 0.08344228
## PubDate.day.minutes.poly.5.ctg 0.41093522
## PubDate.juliandate 334.00000000
## PubDate.last2.log1p.ctg 14.25350675
## PubDate.last32.log1p 12.21791228
## PubDate.last32.log1p.ctg 15.17731420
## PubDate.last4.log1p.ctg 14.57946609
## PubDate.last8.log1p.ctg 15.03390773
## WordCount.nexp 1.00000000
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1 -0.02749464
## PubDate.day.minutes.poly.3 -0.04512497
## PubDate.day.minutes.poly.4 -0.01832685
## PubDate.day.minutes.poly.4.ctg -0.16387098
## PubDate.day.minutes.poly.5 -0.02450918
## PubDate.day.minutes.poly.5.ctg -0.21141841
## PubDate.juliandate 244.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 0.00000000
## PubDate.last32.log1p.ctg 0.00000000
## PubDate.last4.log1p.ctg 0.00000000
## PubDate.last8.log1p.ctg 0.00000000
## WordCount.nexp 0.00000000
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 -0.02745833
## PubDate.day.minutes.poly.3 -0.04482024
## PubDate.day.minutes.poly.4 -0.01824013
## PubDate.day.minutes.poly.4.ctg -0.51296095
## PubDate.day.minutes.poly.5 -0.02362780
## PubDate.day.minutes.poly.5.ctg -0.54313226
## PubDate.juliandate 244.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 0.00000000
## PubDate.last32.log1p.ctg 0.00000000
## PubDate.last4.log1p.ctg 0.00000000
## PubDate.last8.log1p.ctg 0.00000000
## WordCount.nexp 0.00000000
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1 0.02432341
## PubDate.day.minutes.poly.3 0.04834381
## PubDate.day.minutes.poly.4 0.05893666
## PubDate.day.minutes.poly.4.ctg 0.63819571
## PubDate.day.minutes.poly.5 0.07011504
## PubDate.day.minutes.poly.5.ctg 0.45824974
## PubDate.juliandate 365.00000000
## PubDate.last2.log1p.ctg 15.06116892
## PubDate.last32.log1p 12.32340669
## PubDate.last32.log1p.ctg 15.92028658
## PubDate.last4.log1p.ctg 15.35402717
## PubDate.last8.log1p.ctg 15.48083492
## WordCount.nexp 1.00000000
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 0.02475916
## PubDate.day.minutes.poly.3 0.05215301
## PubDate.day.minutes.poly.4 0.06677441
## PubDate.day.minutes.poly.4.ctg 0.38235412
## PubDate.day.minutes.poly.5 0.08471756
## PubDate.day.minutes.poly.5.ctg 0.42244492
## PubDate.juliandate 365.00000000
## PubDate.last2.log1p.ctg 14.77515997
## PubDate.last32.log1p 12.30546086
## PubDate.last32.log1p.ctg 15.91937187
## PubDate.last4.log1p.ctg 15.34357419
## PubDate.last8.log1p.ctg 15.46616891
## WordCount.nexp 0.01831564
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1 -0.02749464
## PubDate.day.minutes.poly.3 -0.04512497
## PubDate.day.minutes.poly.4 -0.01832268
## PubDate.day.minutes.poly.4.ctg -0.23960642
## PubDate.day.minutes.poly.5 -0.02450918
## PubDate.day.minutes.poly.5.ctg -0.35475727
## PubDate.juliandate 335.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 8.86290830
## PubDate.last32.log1p.ctg 0.00000000
## PubDate.last4.log1p.ctg 0.00000000
## PubDate.last8.log1p.ctg 0.00000000
## WordCount.nexp 0.00000000
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 -0.02745833
## PubDate.day.minutes.poly.3 -0.04482024
## PubDate.day.minutes.poly.4 -0.01820339
## PubDate.day.minutes.poly.4.ctg -0.20524473
## PubDate.day.minutes.poly.5 -0.02362780
## PubDate.day.minutes.poly.5.ctg -0.28096322
## PubDate.juliandate 335.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 8.83579237
## PubDate.last32.log1p.ctg 0.00000000
## PubDate.last4.log1p.ctg 0.00000000
## PubDate.last8.log1p.ctg 0.00000000
## WordCount.nexp 0.00000000
## [1] "newobs total range outliers: 1870"
## numeric(0)
## [1] "glb_sel_mdl_id: All.X##rcv#glmnet"
## [1] "glb_fin_mdl_id: Final##rcv#glmnet"
## [1] "Cross Validation issues:"
## MFO###myMFO_classfr Random###myrandom_classfr
## 0 0
## Max.cor.Y.rcv.1X1###glmnet Max.cor.Y.rcv.1X1.cp.0###rpart
## 0 0
## max.Accuracy.OOB max.AUCROCR.OOB
## Max.cor.Y##rcv#rpart 0.8200231 0.5892132
## Max.cor.Y.Time.Lag##rcv#glmnet 0.7818287 0.8024758
## Low.cor.X##rcv#glmnet 0.7783565 0.8052766
## Max.cor.Y.Time.Poly##rcv#glmnet 0.7754630 0.7997373
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.7673611 0.7773858
## Interact.High.cor.Y##rcv#glmnet 0.7656250 0.8140971
## Max.cor.Y.rcv.1X1###glmnet 0.7604167 0.8116126
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.7604167 0.8114863
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.7604167 0.8114863
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.7604167 0.8114863
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.7575231 0.8067975
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.7575231 0.8067975
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.7575231 0.8067975
## All.X##rcv#glmnet 0.6250000 0.8158791
## MFO###myMFO_classfr 0.1331019 0.5000000
## Random###myrandom_classfr 0.1331019 0.4857956
## Final##rcv#glmnet NA NA
## max.AUCpROC.OOB max.Accuracy.fit
## Max.cor.Y##rcv#rpart 0.5870523 0.9296422
## Max.cor.Y.Time.Lag##rcv#glmnet 0.5927265 0.9279769
## Low.cor.X##rcv#glmnet 0.5917252 0.9276303
## Max.cor.Y.Time.Poly##rcv#glmnet 0.5950717 0.9319320
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.6174697 0.9381765
## Interact.High.cor.Y##rcv#glmnet 0.6009259 0.9315850
## Max.cor.Y.rcv.1X1###glmnet 0.5962443 0.9329725
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5962443 0.9333905
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5962443 0.9331818
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5962443 0.9331816
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.5962443 0.9335973
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.5962443 0.9333193
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.5962443 0.9332218
## All.X##rcv#glmnet 0.5873513 0.9261737
## MFO###myMFO_classfr 0.5000000 0.1796420
## Random###myrandom_classfr 0.5125675 0.1796420
## Final##rcv#glmnet NA 0.9065624
## opt.prob.threshold.fit
## Max.cor.Y##rcv#rpart 0.6
## Max.cor.Y.Time.Lag##rcv#glmnet 0.2
## Low.cor.X##rcv#glmnet 0.2
## Max.cor.Y.Time.Poly##rcv#glmnet 0.4
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.4
## Interact.High.cor.Y##rcv#glmnet 0.4
## Max.cor.Y.rcv.1X1###glmnet 0.5
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.4
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.4
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.4
## All.X##rcv#glmnet 0.3
## MFO###myMFO_classfr 0.1
## Random###myrandom_classfr 0.1
## Final##rcv#glmnet 0.2
## opt.prob.threshold.OOB
## Max.cor.Y##rcv#rpart 0.6
## Max.cor.Y.Time.Lag##rcv#glmnet 0.1
## Low.cor.X##rcv#glmnet 0.1
## Max.cor.Y.Time.Poly##rcv#glmnet 0.1
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.1
## Interact.High.cor.Y##rcv#glmnet 0.1
## Max.cor.Y.rcv.1X1###glmnet 0.1
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.1
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.1
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.1
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.1
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.1
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.1
## All.X##rcv#glmnet 0.1
## MFO###myMFO_classfr 0.1
## Random###myrandom_classfr 0.1
## Final##rcv#glmnet NA
## [1] "All.X##rcv#glmnet OOB confusion matrix & accuracy: "
## Prediction
## Reference N Y
## N 874 624
## Y 24 206
## .freqRatio.Fit .freqRatio.OOB
## OpEd#Opinion# 0.090965862 0.0515046296
## #Opinion#ThePublicEditor 0.003330558 0.0023148148
## Styles#U.S.# 0.026436303 0.0289351852
## Business#Crosswords/Games# 0.021856786 0.0104166667
## Science#Health# 0.030807660 0.0277777778
## Business#Technology# 0.044338052 0.0729166667
## ## 0.190049958 0.2146990741
## Business#BusinessDay#Dealbook 0.130932556 0.1869212963
## Metro#N.Y./Region# 0.026644463 0.0405092593
## Culture#Arts# 0.101998335 0.1070601852
## #Opinion#RoomForDebate 0.008742714 0.0115740741
## Styles##Fashion 0.021648626 0.0086805556
## Business#BusinessDay#SmallBusiness 0.020815987 0.0231481481
## myOther 0.006869276 0.0028935185
## Travel#Travel# 0.017277269 0.0196759259
## Culture## NA 0.0005787037
## Foreign#World#AsiaPacific 0.031223980 0.0306712963
## #Multimedia# 0.019150708 0.0283564815
## TStyle## 0.129683597 0.0584490741
## #U.S.#Education 0.050582848 0.0474537037
## Foreign#World# 0.026644463 0.0254629630
## .freqRatio.Tst .n.Fit .n.New.N .n.New.Y
## OpEd#Opinion# 0.087700535 437 NA 164
## #Opinion#ThePublicEditor 0.005347594 16 NA 10
## Styles#U.S.# 0.032620321 127 NA 61
## Business#Crosswords/Games# 0.022459893 105 NA 42
## Science#Health# 0.030481283 148 NA 57
## Business#Technology# 0.060962567 213 34 80
## ## 0.182887701 913 265 77
## Business#BusinessDay#Dealbook 0.162566845 629 197 107
## Metro#N.Y./Region# 0.035828877 128 37 30
## Culture#Arts# 0.093048128 490 157 17
## #Opinion#RoomForDebate 0.010695187 42 19 1
## Styles##Fashion 0.008021390 104 15 NA
## Business#BusinessDay#SmallBusiness 0.021925134 100 35 6
## myOther 0.002673797 33 5 NA
## Travel#Travel# 0.018716578 83 35 NA
## Culture## 0.037433155 NA 48 22
## Foreign#World#AsiaPacific 0.029946524 150 49 7
## #Multimedia# 0.027807487 92 52 NA
## TStyle## 0.056149733 623 104 1
## #U.S.#Education 0.047593583 243 87 2
## Foreign#World# 0.025133690 128 47 NA
## .n.OOB .n.Trn.N .n.Trn.Y .n.Tst .n.fit
## OpEd#Opinion# 89 117 409 164 437
## #Opinion#ThePublicEditor 4 4 16 10 16
## Styles#U.S.# 50 77 100 61 127
## Business#Crosswords/Games# 18 20 103 42 105
## Science#Health# 48 74 122 57 148
## Business#Technology# 126 288 51 114 213
## ## 371 1169 115 342 913
## Business#BusinessDay#Dealbook 323 864 88 304 629
## Metro#N.Y./Region# 70 181 17 67 128
## Culture#Arts# 185 625 50 174 490
## #Opinion#RoomForDebate 20 61 1 20 42
## Styles##Fashion 15 118 1 15 104
## Business#BusinessDay#SmallBusiness 40 135 5 41 100
## myOther 5 38 NA 5 33
## Travel#Travel# 34 116 1 35 83
## Culture## 1 1 NA 70 NA
## Foreign#World#AsiaPacific 53 200 3 56 150
## #Multimedia# 49 139 2 52 92
## TStyle## 101 715 9 105 623
## #U.S.#Education 82 325 NA 89 243
## Foreign#World# 44 172 NA 47 128
## .n.new .n.trn err.abs.OOB.mean
## OpEd#Opinion# 164 526 0.52321102
## #Opinion#ThePublicEditor 10 20 0.50239104
## Styles#U.S.# 61 177 0.47246602
## Business#Crosswords/Games# 42 123 0.46561615
## Science#Health# 57 196 0.46139079
## Business#Technology# 114 339 0.23595334
## ## 342 1284 0.21034956
## Business#BusinessDay#Dealbook 304 952 0.20490967
## Metro#N.Y./Region# 67 198 0.19235205
## Culture#Arts# 174 675 0.18743908
## #Opinion#RoomForDebate 20 62 0.18687364
## Styles##Fashion 15 119 0.14401821
## Business#BusinessDay#SmallBusiness 41 140 0.14071469
## myOther 5 38 0.11352091
## Travel#Travel# 35 117 0.10666841
## Culture## 70 1 0.10273457
## Foreign#World#AsiaPacific 56 203 0.10074984
## #Multimedia# 52 141 0.09873497
## TStyle## 105 724 0.09416257
## #U.S.#Education 89 325 0.07340342
## Foreign#World# 47 172 0.07265096
## err.abs.fit.mean err.abs.new.mean
## OpEd#Opinion# 0.38890528 NA
## #Opinion#ThePublicEditor 0.45429959 NA
## Styles#U.S.# 0.49067778 NA
## Business#Crosswords/Games# 0.35876063 NA
## Science#Health# 0.45459002 NA
## Business#Technology# 0.21997010 NA
## ## 0.14550925 NA
## Business#BusinessDay#Dealbook 0.15389806 NA
## Metro#N.Y./Region# 0.15562125 NA
## Culture#Arts# 0.12297120 NA
## #Opinion#RoomForDebate 0.15567171 NA
## Styles##Fashion 0.08611350 NA
## Business#BusinessDay#SmallBusiness 0.13040079 NA
## myOther 0.11273415 NA
## Travel#Travel# 0.08192542 NA
## Culture## NA NA
## Foreign#World#AsiaPacific 0.10367656 NA
## #Multimedia# 0.09072323 NA
## TStyle## 0.06994744 NA
## #U.S.#Education 0.06368409 NA
## Foreign#World# 0.06983757 NA
## err.abs.trn.mean err.abs.OOB.sum
## OpEd#Opinion# 0.33251375 46.5657806
## #Opinion#ThePublicEditor 0.32998288 2.0095642
## Styles#U.S.# 0.45567870 23.6233012
## Business#Crosswords/Games# 0.23852271 8.3810907
## Science#Health# 0.38793668 22.1467580
## Business#Technology# 0.23001139 29.7301209
## ## 0.13393000 78.0396865
## Business#BusinessDay#Dealbook 0.15397456 66.1858222
## Metro#N.Y./Region# 0.14888110 13.4646433
## Culture#Arts# 0.11214813 34.6762293
## #Opinion#RoomForDebate 0.06272545 3.7374728
## Styles##Fashion 0.03797197 2.1602732
## Business#BusinessDay#SmallBusiness 0.08673011 5.6285877
## myOther 0.03739458 0.5676045
## Travel#Travel# 0.03739970 3.6267258
## Culture## 0.06664375 0.1027346
## Foreign#World#AsiaPacific 0.04536221 5.3397418
## #Multimedia# 0.04594878 4.8380137
## TStyle## 0.02965505 9.5104193
## #U.S.#Education 0.01357338 6.0190802
## Foreign#World# 0.01777674 3.1966422
## err.abs.fit.sum err.abs.new.sum
## OpEd#Opinion# 169.951607 NA
## #Opinion#ThePublicEditor 7.268793 NA
## Styles#U.S.# 62.316078 NA
## Business#Crosswords/Games# 37.669866 NA
## Science#Health# 67.279323 NA
## Business#Technology# 46.853632 NA
## ## 132.849946 NA
## Business#BusinessDay#Dealbook 96.801881 NA
## Metro#N.Y./Region# 19.919519 NA
## Culture#Arts# 60.255887 NA
## #Opinion#RoomForDebate 6.538212 NA
## Styles##Fashion 8.955804 NA
## Business#BusinessDay#SmallBusiness 13.040079 NA
## myOther 3.720227 NA
## Travel#Travel# 6.799810 NA
## Culture## NA NA
## Foreign#World#AsiaPacific 15.551484 NA
## #Multimedia# 8.346537 NA
## TStyle## 43.577253 NA
## #U.S.#Education 15.475235 NA
## Foreign#World# 8.939209 NA
## err.abs.trn.sum
## OpEd#Opinion# 174.90223507
## #Opinion#ThePublicEditor 6.59965754
## Styles#U.S.# 80.65513073
## Business#Crosswords/Games# 29.33829341
## Science#Health# 76.03558884
## Business#Technology# 77.97386000
## ## 171.96611762
## Business#BusinessDay#Dealbook 146.58378254
## Metro#N.Y./Region# 29.47845712
## Culture#Arts# 75.69998885
## #Opinion#RoomForDebate 3.88897815
## Styles##Fashion 4.51866405
## Business#BusinessDay#SmallBusiness 12.14221607
## myOther 1.42099408
## Travel#Travel# 4.37576471
## Culture## 0.06664375
## Foreign#World#AsiaPacific 9.20852860
## #Multimedia# 6.47877859
## TStyle## 21.47025749
## #U.S.#Education 4.41134846
## Foreign#World# 3.05759995
## .freqRatio.Fit .freqRatio.OOB .freqRatio.Tst .n.Fit
## NA 1.000000 1.000000 NA
## .n.New.N .n.New.Y .n.OOB .n.Trn.N
## NA NA 1728.000000 5439.000000
## .n.Trn.Y .n.Tst .n.fit .n.new
## NA 1870.000000 NA 1870.000000
## .n.trn err.abs.OOB.mean err.abs.fit.mean err.abs.new.mean
## 6532.000000 4.690311 NA NA
## err.abs.trn.mean err.abs.OOB.sum err.abs.fit.sum err.abs.new.sum
## 3.004762 369.550293 NA NA
## err.abs.trn.sum
## 940.272886
## All.X__rcv_glmnet.imp
## PubDate.day.minutes.poly.1 100.000000
## PubDate.day.minutes.poly.4 46.372779
## NDSSName.my.fctrOpEd#Opinion# 29.922243
## NDSSName.my.fctrBusiness#Crosswords/Games# 27.460942
## PubDate.day.minutes.poly.2 25.434881
## NDSSName.my.fctrScience#Health# 25.239681
## NDSSName.my.fctr#Opinion#ThePublicEditor 25.093700
## NDSSName.my.fctrStyles#U.S.# 23.692755
## PubDate.wkend 7.735688
## WordCount.log1p 7.679961
## PubDate.hour.fctr(15.3,23] 6.634766
## WordCount.root2 6.473412
## PubDate.last4.log1p 6.419482
## PubDate.last2.log1p 6.356495
## PubDate.last8.log1p 6.260793
## PubDate.day.minutes.poly.3 6.246564
## NDSSName.my.fctrBusiness#Technology# 6.246564
## PubDate.hour.fctr(7.67,15.3] 6.246564
## PubDate.last16.log1p 6.246564
## PubDate.wkday.fctr1 6.246564
## PubDate.last32.log1p 6.246564
## PubDate.date.fctr(7,13] 6.246564
## .rnorm 6.246564
## NDSSName.my.fctrCulture## 6.246564
## NDSSName.my.fctrMetro#N.Y./Region# 6.246564
## PubDate.date.fctr(19,25] 6.246564
## PubDate.date.fctr(25,31] 6.246564
## PubDate.day.minutes.poly.5 6.246564
## PubDate.juliandate 6.246564
## PubDate.minute.fctr(14.8,29.5] 6.246564
## PubDate.minute.fctr(44.2,59.1] 6.246564
## PubDate.month.fctr10 6.246564
## PubDate.month.fctr12 6.246564
## PubDate.second.fctr(14.8,29.5] 6.246564
## PubDate.second.fctr(29.5,44.2] 6.246564
## PubDate.wkday.fctr3 6.246564
## PubDate.wkday.fctr4 6.246564
## WordCount.nexp 6.246564
## PubDate.wkday.fctr2 6.246564
## PubDate.date.fctr(13,19] 6.246564
## PubDate.month.fctr11 6.246564
## PubDate.second.fctr(44.2,59.1] 6.246564
## PubDate.wkday.fctr6 6.246564
## PubDate.wkday.fctr5 6.246564
## PubDate.minute.fctr(29.5,44.2] 6.246564
## NDSSName.my.fctr#Multimedia# 5.761664
## NDSSName.my.fctrTravel#Travel# 4.954982
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 4.834346
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 4.622150
## NDSSName.my.fctrCulture#Arts# 4.534053
## Final__rcv_glmnet.imp
## PubDate.day.minutes.poly.1 90.40903
## PubDate.day.minutes.poly.4 32.18318
## NDSSName.my.fctrOpEd#Opinion# 33.93385
## NDSSName.my.fctrBusiness#Crosswords/Games# 31.44233
## PubDate.day.minutes.poly.2 100.00000
## NDSSName.my.fctrScience#Health# 28.67549
## NDSSName.my.fctr#Opinion#ThePublicEditor 30.81095
## NDSSName.my.fctrStyles#U.S.# 27.49488
## PubDate.wkend 16.18021
## WordCount.log1p 16.43344
## PubDate.hour.fctr(15.3,23] 14.55456
## WordCount.root2 14.82783
## PubDate.last4.log1p 14.55456
## PubDate.last2.log1p 14.55456
## PubDate.last8.log1p 14.55456
## PubDate.day.minutes.poly.3 25.72759
## NDSSName.my.fctrBusiness#Technology# 17.06756
## PubDate.hour.fctr(7.67,15.3] 15.42835
## PubDate.last16.log1p 15.05236
## PubDate.wkday.fctr1 15.02383
## PubDate.last32.log1p 14.63405
## PubDate.date.fctr(7,13] 14.56304
## .rnorm 14.55456
## NDSSName.my.fctrCulture## 14.55456
## NDSSName.my.fctrMetro#N.Y./Region# 14.55456
## PubDate.date.fctr(19,25] 14.55456
## PubDate.date.fctr(25,31] 14.55456
## PubDate.day.minutes.poly.5 14.55456
## PubDate.juliandate 14.55456
## PubDate.minute.fctr(14.8,29.5] 14.55456
## PubDate.minute.fctr(44.2,59.1] 14.55456
## PubDate.month.fctr10 14.55456
## PubDate.month.fctr12 14.55456
## PubDate.second.fctr(14.8,29.5] 14.55456
## PubDate.second.fctr(29.5,44.2] 14.55456
## PubDate.wkday.fctr3 14.55456
## PubDate.wkday.fctr4 14.55456
## WordCount.nexp 14.55456
## PubDate.wkday.fctr2 14.50995
## PubDate.date.fctr(13,19] 14.31658
## PubDate.month.fctr11 14.24288
## PubDate.second.fctr(44.2,59.1] 14.23286
## PubDate.wkday.fctr6 14.02941
## PubDate.wkday.fctr5 13.90528
## PubDate.minute.fctr(29.5,44.2] 13.84506
## NDSSName.my.fctr#Multimedia# 10.67411
## NDSSName.my.fctrTravel#Travel# 12.16166
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 11.40620
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 13.73612
## NDSSName.my.fctrCulture#Arts# 14.24518
## [1] "glbObsNew prediction stats:"
##
## N Y
## 1186 684
## label step_major step_minor label_minor bgn end
## 16 predict.data.new 8 0 0 458.798 479.856
## 17 display.session.info 9 0 0 479.857 NA
## elapsed
## 16 21.059
## 17 NA
Null Hypothesis (\(\sf{H_{0}}\)): mpg is not impacted by am_fctr.
The variance of mpg does not appear to depend on am_fctr.

    #{r q1, cache=FALSE}
    # print(t.test(subset(cars_df, am_fctr == "automatic")$mpg,
    #              subset(cars_df, am_fctr == "manual")$mpg,
    #              var.equal=FALSE)$conf)

We reject the null hypothesis, i.e., we have evidence to conclude that am_fctr impacts mpg (95% confidence). Manual transmission yields better miles per gallon than automatic transmission.
## label step_major step_minor label_minor bgn
## 10 fit.models 6 0 0 63.608
## 14 fit.data.training 7 0 0 380.488
## 11 fit.models 6 1 1 328.370
## 9 select.features 5 0 0 37.872
## 16 predict.data.new 8 0 0 458.798
## 12 fit.models 6 2 2 359.525
## 1 import.data 1 0 0 9.601
## 15 fit.data.training 7 1 1 448.725
## 5 extract.features 3 0 0 28.037
## 13 fit.models 6 3 3 374.443
## 2 inspect.data 2 0 0 23.864
## 8 partition.data.training 4 0 0 36.455
## 6 manage.missing.data 3 1 1 34.987
## 3 scrub.data 2 1 1 26.894
## 7 cluster.data 3 2 2 36.130
## 4 transform.data 2 2 2 27.724
## end elapsed duration
## 10 328.369 264.762 264.761
## 14 448.724 68.236 68.236
## 11 359.524 31.155 31.154
## 9 63.608 25.736 25.736
## 16 479.856 21.059 21.058
## 12 374.443 14.918 14.918
## 1 23.864 14.263 14.263
## 15 458.797 10.072 10.072
## 5 34.987 6.950 6.950
## 13 380.488 6.045 6.045
## 2 26.893 3.029 3.029
## 8 37.871 1.417 1.416
## 6 36.130 1.143 1.143
## 3 27.723 0.830 0.829
## 7 36.454 0.324 0.324
## 4 28.037 0.313 0.313
## [1] "Total Elapsed Time: 479.856 secs"
## label step_major step_minor
## 9 fit.models_0_Low.cor.X 1 8
## 4 fit.models_0_Max.cor.Y.rcv.*X* 1 3
## 7 fit.models_0_Max.cor.Y.Time.Lag 1 6
## 6 fit.models_0_Max.cor.Y.Time.Poly 1 5
## 5 fit.models_0_Max.cor.Y[rcv.1X1.cp.0|] 1 4
## 8 fit.models_0_Interact.High.cor.Y 1 7
## 3 fit.models_0_Random 1 2
## 2 fit.models_0_MFO 1 1
## 1 fit.models_0_bgn 1 0
## label_minor bgn end elapsed duration
## 9 glmnet 243.470 328.355 84.885 84.885
## 4 glmnet 72.630 150.481 77.851 77.851
## 7 glmnet 184.992 231.726 46.734 46.734
## 6 glmnet 165.179 184.992 19.813 19.813
## 5 rpart 150.482 165.178 14.696 14.696
## 8 glmnet 231.726 243.470 11.744 11.744
## 3 myrandom_classfr 68.049 72.629 4.580 4.580
## 2 myMFO_classfr 64.846 68.049 3.203 3.203
## 1 setup 64.812 64.845 0.033 0.033
## [1] "Total Elapsed Time: 328.355 secs"